Commit 48fe31ce authored by Lukas-Stingl's avatar Lukas-Stingl
Browse files

webscraper in src

parent 06cfa1e5
%% Cell type:code id:0aec3818-50a5-4e40-abfe-09dc0620e6c5 tags:
``` python
import sys

# Make the sibling ``src`` directory importable from the notebook.
# Forward slashes are used because '..\src' contains the invalid escape
# sequence '\s' and only resolves as a path on Windows.
sys.path.insert(1, '../src')
from additionalFeatures import webscraper

# Holiday overview pages for Hessen, one per year (2020 and 2021).
url = [
    'https://urlaubstage-planen.de/feiertage-deutschland/2020/hessen.htm',
    'https://urlaubstage-planen.de/feiertage-deutschland/2021/hessen.htm',
]
# The trailing semicolon keeps the notebook from echoing the call's result.
webscraper(url);
```
%% Cell type:code id:9dabd84d-e26b-4091-9470-c118dae7267d tags:
``` python
```
import requests
from lxml import html
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import csv
from datetime import date, timedelta
def webscraper(url, *, start='2020-01-01', end='2021-12-31',
               output_path='../data/raw/holidays.csv'):
    """Scrape holiday dates and write a daily holiday-indicator CSV.

    Every page in *url* is downloaded and the text of the first column of
    its table body rows is collected. A DataFrame with one row per calendar
    day from *start* to *end* is then built; its ``Holiday`` column is 1
    where ``Date`` matches one of the scraped values and 0 otherwise. The
    frame is printed and written to *output_path* without the index.

    Args:
        url: Iterable of page URLs. A single URL string is also accepted.
        start: First day of the generated calendar.
        end: Last day of the generated calendar.
        output_path: Destination of the CSV file, resolved against the
            current working directory when relative. Forward slashes are
            used so the default works on both Windows and POSIX.

    Raises:
        requests.RequestException: If a page cannot be fetched in time or
            answers with an HTTP error status.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(url, str):
        url = [url]

    date_xpath = (
        '//*[@id="page-content"]/main/article/section/div/table'
        '/tbody/tr/td[1]/text()'
    )

    scraped_dates = []
    for page_url in url:
        # Bound the wait and fail loudly on HTTP errors instead of parsing
        # an error page and silently producing zero holidays.
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
        tree = html.fromstring(response.content)
        # Original note: entries 0 to 12 are the work-free public holidays.
        # NOTE(review): the slice keeps the first 12 entries (indices 0-11);
        # confirm whether a 13th entry was intended.
        scraped_dates.extend(tree.xpath(date_xpath)[:12])

    holidays = [str(value) for value in scraped_dates]

    df = pd.DataFrame()
    df['Date'] = pd.date_range(start=start, end=end)
    # NOTE(review): the scraped values are strings compared against
    # datetime64 values; matching relies on pandas interpreting the page's
    # date format correctly - confirm against the actual page content.
    df['Holiday'] = df['Date'].isin(holidays).astype(int)
    print(df)
    df.to_csv(output_path, index=False)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment