Commit 48fe31ce authored by Lukas-Stingl's avatar Lukas-Stingl
Browse files

webscraper in src

parent 06cfa1e5
%% Cell type:code id:0aec3818-50a5-4e40-abfe-09dc0620e6c5 tags:
``` python
# Driver cell: make the project's ``src`` directory importable, then build the
# holiday CSV for Hessen from the 2020 and 2021 pages.
import os
import sys

# Build the path with os.path so no backslash escape (the former '\s') is
# needed and the separator matches the host OS.
sys.path.insert(1, os.path.join(os.pardir, 'src'))

from additionalFeatures import webscraper

urls = [
    'https://urlaubstage-planen.de/feiertage-deutschland/2020/hessen.htm',
    'https://urlaubstage-planen.de/feiertage-deutschland/2021/hessen.htm',
]
webscraper(urls)
```
%% Output
Date Holiday
0 2020-01-01 1
1 2020-01-02 0
2 2020-01-03 0
3 2020-01-04 0
4 2020-01-05 1
.. ... ...
726 2021-12-27 0
727 2021-12-28 0
728 2021-12-29 0
729 2021-12-30 0
730 2021-12-31 0
[731 rows x 2 columns]
%% Cell type:code id:9dabd84d-e26b-4091-9470-c118dae7267d tags:
``` python
```
import requests
from lxml import html
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import csv
from datetime import date, timedelta
def webscraper(url, start='2020-01-01', end='2021-12-31',
               output_path='../data/raw/holidays.csv'):
    """Scrape holiday dates and write a per-day holiday indicator CSV.

    For every page in ``url`` the first column of the holiday table is read,
    the dates are parsed, and a frame with one row per calendar day between
    ``start`` and ``end`` is built. ``Holiday`` is 1 on scraped holiday dates
    and 0 otherwise. The frame is printed, written to ``output_path`` and
    returned.

    Args:
        url: Iterable of page URLs (a single URL string is also accepted).
        start: First day of the generated calendar (inclusive).
        end: Last day of the generated calendar (inclusive).
        output_path: Destination of the CSV file; its directory must exist.

    Returns:
        pandas.DataFrame with the columns ``Date`` and ``Holiday``.

    Raises:
        requests.HTTPError: If a page responds with an HTTP error status.
        ValueError: If a scraped date string cannot be parsed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(url, str):
        url = [url]

    # Rows 0 to 12 of the table are the work-free (statutory) holidays; as in
    # the original slice, only the first 12 rows are taken.
    max_rows = 12
    date_xpath = ('//*[@id="page-content"]/main/article/section/div/table'
                  '/tbody/tr/td[1]/text()')

    raw_dates = []
    for page_url in url:
        # Time out instead of hanging forever, and fail loudly on an HTTP
        # error rather than silently producing a CSV without holidays.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        tree = html.fromstring(response.content)
        raw_dates.extend(tree.xpath(date_xpath)[:max_rows])

    # NOTE(review): the page presumably lists dates day-first (DD.MM.YYYY).
    # Leaving the parsing to pandas' month-first default would turn 1 May
    # into 5 January, so the dates are parsed explicitly. Confirm the format
    # against the live page.
    cleaned = [str(value).strip() for value in raw_dates]
    holidays = pd.to_datetime(cleaned, dayfirst=True)

    df = pd.DataFrame({'Date': pd.date_range(start=start, end=end)})
    df['Holiday'] = df['Date'].isin(holidays).astype(int)

    print(df)
    # Forward slashes resolve on Windows as well as on POSIX systems.
    df.to_csv(output_path, index=False)
    return df
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment