Commit 95194b68 authored by ukuiq's avatar ukuiq
Browse files

saved functionality in src

parent 1ca8e232
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sn
import datetime
```
%% Cell type:code id: tags:
``` python
# Folders holding the preprocessed CSV files of the three clusters.
path_0 = '../data/preprocessed/clusters/0/'
path_1 = '../data/preprocessed/clusters/1/'
path_2 = '../data/preprocessed/clusters/2/'

# Per-cluster containers: frames from regular files ...
dfs_c_0, dfs_c_1, dfs_c_2 = [], [], []
# ... and frames from files whose name starts with "g_".
dfs_c_0_grouped, dfs_c_1_grouped, dfs_c_2_grouped = [], [], []

# Only ".csv" entries of each cluster folder are loaded.
csv_files_0 = [name for name in os.listdir(path_0) if name.endswith('.csv')]
csv_files_1 = [name for name in os.listdir(path_1) if name.endswith('.csv')]
csv_files_2 = [name for name in os.listdir(path_2) if name.endswith('.csv')]

# Read every file and route it to the grouped or the plain list of its cluster.
for file in csv_files_0:
    df = pd.read_csv(path_0 + file)
    (dfs_c_0_grouped if file.startswith('g_') else dfs_c_0).append(df)

for file in csv_files_1:
    df = pd.read_csv(path_1 + file)
    (dfs_c_1_grouped if file.startswith('g_') else dfs_c_1).append(df)

for file in csv_files_2:
    df = pd.read_csv(path_2 + file)
    (dfs_c_2_grouped if file.startswith('g_') else dfs_c_2).append(df)
import sys
from pathlib import Path
# in jupyter (lab / notebook), based on notebook path
module_path = str(Path.cwd().parents[0] / "src")
if module_path not in sys.path:
sys.path.append(module_path)
from preprocessing import import_preprocessed_data, assign_holidays, assign_weather
```
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
%aimport preprocessing
```
%% Cell type:markdown id: tags:
Include all data into DataFrames
%% Cell type:code id: tags:
``` python
dfs_c_0, dfs_c_1, dfs_c_2, dfs_c_0_grouped, dfs_c_1_grouped, dfs_c_2_grouped = import_preprocessed_data() # import all data
```
%% Cell type:markdown id: tags:
Include holiday data
Include holiday data, holiday = 1, no holiday = 0
%% Cell type:code id: tags:
``` python
# Path of the raw holiday calendar CSV (relative to the notebook folder).
holiday = '../data/raw/holidays.csv'
# Load the calendar; assign_holidays reads its 'Date' and 'Holiday' columns.
df_holiday = pd.read_csv(holiday)
# Display the frame as the cell output.
df_holiday
```
%%%% Output: execute_result
Date Holiday
0 2020-01-01 1
1 2020-01-02 0
2 2020-01-03 0
3 2020-01-04 0
4 2020-01-05 1
.. ... ...
726 2021-12-27 0
727 2021-12-28 0
728 2021-12-29 0
729 2021-12-30 0
730 2021-12-31 0
[731 rows x 2 columns]
%% Cell type:code id: tags:
``` python
import datetime
```
%% Cell type:code id: tags:
``` python
def assign_holidays(df, df_holiday):
    """Add a ``holiday`` column to ``df`` by looking up each row's date.

    The column is written in place on ``df``; nothing is returned.

    Args:
        df: DataFrame with a ``time_stamp`` column of ``"%Y-%m-%d"`` strings.
        df_holiday: DataFrame with a ``Date`` column of ``"%Y-%m-%d"`` strings
            and a ``Holiday`` column holding the value to assign.

    Raises:
        ValueError: if a date string does not match ``"%Y-%m-%d"``.

    Rows whose date does not occur in ``df_holiday`` get ``0``.
    """
    form = "%Y-%m-%d"
    parse = datetime.datetime.strptime

    # Default value; also keeps an integer column when ``df`` has no rows.
    df['holiday'] = 0
    if not len(df):
        return

    # Parse every calendar date once and index it. When a date occurs more
    # than once the last entry wins, like a full scan without ``break``.
    holiday_by_date = {
        parse(date, form): flag
        for date, flag in zip(df_holiday['Date'], df_holiday['Holiday'])
    }

    # Assign the whole column at once: no chained ``df[col][i] = ...`` writes
    # (SettingWithCopyWarning) and no dependence on a 0..n-1 index.
    df['holiday'] = [
        holiday_by_date.get(parse(stamp, form), 0) for stamp in df['time_stamp']
    ]
```
%% Cell type:code id: tags:
``` python
# Add the holiday flag to every grouped DataFrame of all three clusters.
for cluster_frames in (dfs_c_0_grouped, dfs_c_1_grouped, dfs_c_2_grouped):
    for df in cluster_frames:
        assign_holidays(df, df_holiday)
```
%%%% Output: stream
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/2432097566.py:12: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['holiday'][i] = df_holiday['Holiday'][j]
%% Cell type:markdown id: tags:
Include weather data
Include weather data of Frankfurt airport
%% Cell type:code id: tags:
``` python
# Path of the raw Frankfurt airport weather CSV (relative to the notebook folder).
weather_path = '../data/raw/weather_FrankfurtAirport.csv'
# Load the weather table; assign_weather reads its 'date', 'tavg', 'tmin',
# 'tmax', 'prcp', 'snow' and 'tsun' columns.
df_weather = pd.read_csv(weather_path)
# Display the frame as the cell output.
df_weather
```
%%%% Output: execute_result
date tavg tmin tmax prcp snow wdir wspd wpgt pres tsun
0 2020-05-08 17.5 8.2 24.8 0.0 0 112.0 7.2 27.7 1018.0 616
1 2020-05-09 15.7 12.5 17.4 0.6 0 60.0 11.5 25.2 1013.5 10
2 2020-05-10 16.8 11.7 24.9 2.9 0 129.0 8.6 63.0 1006.5 308
3 2020-05-11 7.2 1.9 15.5 9.0 0 56.0 23.8 61.2 1009.2 38
4 2020-05-12 7.9 1.0 14.2 0.0 0 130.0 8.3 24.1 1018.7 736
.. ... ... ... ... ... ... ... ... ... ... ...
361 2021-05-04 10.5 8.0 13.7 0.0 0 215.0 32.4 77.8 1006.5 67
362 2021-05-05 7.9 4.1 11.4 0.7 0 248.0 24.1 65.5 1008.7 422
363 2021-05-06 6.9 3.2 12.1 5.9 0 174.0 11.5 34.9 1009.1 46
364 2021-05-07 7.5 1.5 12.1 0.7 0 273.0 14.4 59.4 1015.9 654
365 2021-05-08 10.0 -0.7 17.5 0.0 0 169.0 9.4 38.2 1017.6 620
[366 rows x 11 columns]
%% Cell type:code id: tags:
``` python
def assign_weather(df, df_weather):
    """Add daily weather columns to ``df`` by looking up each row's date.

    The columns ``temp avg``, ``temp min``, ``temp max``, ``rainfall sum``,
    ``snowfall sum`` and ``sunshine minutes`` are written in place on ``df``
    from the ``tavg``, ``tmin``, ``tmax``, ``prcp``, ``snow`` and ``tsun``
    columns of ``df_weather``; nothing is returned.

    Args:
        df: DataFrame with a ``time_stamp`` column of ``"%Y-%m-%d"`` strings.
        df_weather: DataFrame with a ``date`` column of ``"%Y-%m-%d"`` strings
            and the source columns listed above.

    Raises:
        ValueError: if a date string does not match ``"%Y-%m-%d"``.

    Rows whose date does not occur in ``df_weather`` keep the defaults
    (``0.0`` for the float columns, ``0`` for ``sunshine minutes``).
    """
    form = "%Y-%m-%d"
    parse = datetime.datetime.strptime
    # (target column in df, source column in df_weather, default without match)
    column_map = (
        ('temp avg', 'tavg', 0.0),
        ('temp min', 'tmin', 0.0),
        ('temp max', 'tmax', 0.0),
        ('rainfall sum', 'prcp', 0.0),
        ('snowfall sum', 'snow', 0.0),
        ('sunshine minutes', 'tsun', 0),
    )

    # Create the columns with their defaults; this also fixes the dtypes
    # when ``df`` has no rows.
    for target, _, default in column_map:
        df[target] = default
    if not len(df):
        return

    # Parse every weather date once and remember its row position. When a
    # date occurs more than once the last row wins, like a full scan
    # without ``break``.
    row_by_date = {
        parse(date, form): pos for pos, date in enumerate(df_weather['date'])
    }
    rows = [row_by_date.get(parse(stamp, form)) for stamp in df['time_stamp']]

    # Assign whole columns at once: no chained ``df[col][i] = ...`` writes
    # (SettingWithCopyWarning) and no dependence on a 0..n-1 index.
    for target, source, default in column_map:
        source_values = df_weather[source].tolist()
        values = [default if pos is None else source_values[pos] for pos in rows]
        if isinstance(default, float):
            # Keep float columns float even when the source holds integers.
            values = [float(value) for value in values]
        df[target] = values
```
%% Cell type:code id: tags:
``` python
# Add the weather columns to every grouped DataFrame of all three clusters.
for cluster_frames in (dfs_c_0_grouped, dfs_c_1_grouped, dfs_c_2_grouped):
    for df in cluster_frames:
        assign_weather(df, df_weather)
```
%%%% Output: stream
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/777759664.py:17: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['temp avg'][i] = df_weather['tavg'][j]
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/777759664.py:18: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['temp min'][i] = df_weather['tmin'][j]
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/777759664.py:19: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['temp max'][i] = df_weather['tmax'][j]
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/777759664.py:20: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['rainfall sum'][i] = df_weather['prcp'][j]
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/777759664.py:21: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['snowfall sum'][i] = df_weather['snow'][j]
C:\Users\hendr\AppData\Local\Temp/ipykernel_8496/777759664.py:22: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df['sunshine minutes'][i] = df_weather['tsun'][j]
%% Cell type:code id: tags:
``` python
# Use the first grouped DataFrame of cluster 0 as the sample container dataset.
data = dfs_c_0_grouped[0]
# Columns included in the correlation analysis: the target 'inter_pol', the
# sensor readings and the holiday/weather columns added above.
numeric_col = ['inter_pol','Temperature','Tilt','holiday','temp avg','temp min','temp max','rainfall sum','snowfall sum','sunshine minutes']
# Pairwise correlation matrix of the selected columns.
corr_matrix = data.loc[:,numeric_col].corr()
print(corr_matrix)
# Visualize the correlation matrix as an annotated heatmap.
sn.heatmap(corr_matrix, annot=True)
```
%%%% Output: stream
inter_pol Temperature Tilt holiday temp avg \
inter_pol 1.000000 0.272704 0.066792 0.036456 0.330773
Temperature 0.272704 1.000000 -0.253476 -0.101045 0.954478
Tilt 0.066792 -0.253476 1.000000 0.060309 -0.241740
holiday 0.036456 -0.101045 0.060309 1.000000 -0.109564
temp avg 0.330773 0.954478 -0.241740 -0.109564 1.000000
temp min 0.352179 0.858892 -0.164456 -0.095769 0.947680
temp max 0.289772 0.963847 -0.285066 -0.107842 0.976731
rainfall sum 0.059159 -0.054447 0.060050 -0.043965 0.011288
snowfall sum -0.051326 -0.223033 0.174676 -0.032307 -0.270178
sunshine minutes 0.110113 0.648934 -0.234403 -0.005993 0.495271
temp min temp max rainfall sum snowfall sum \
inter_pol 0.352179 0.289772 0.059159 -0.051326
Temperature 0.858892 0.963847 -0.054447 -0.223033
Tilt -0.164456 -0.285066 0.060050 0.174676
holiday -0.095769 -0.107842 -0.043965 -0.032307
temp avg 0.947680 0.976731 0.011288 -0.270178
temp min 1.000000 0.871725 0.097316 -0.246456
temp max 0.871725 1.000000 -0.025114 -0.270493
rainfall sum 0.097316 -0.025114 1.000000 -0.016413
snowfall sum -0.246456 -0.270493 -0.016413 1.000000
sunshine minutes 0.272622 0.610498 -0.252725 -0.126974
sunshine minutes
inter_pol 0.110113
Temperature 0.648934
Tilt -0.234403
holiday -0.005993
temp avg 0.495271
temp min 0.272622
temp max 0.610498
rainfall sum -0.252725
snowfall sum -0.126974
sunshine minutes 1.000000
%%%% Output: execute_result
<AxesSubplot:>
%%%% Output: display_data
![]()
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
%% Cell type:markdown id: tags:
``` python
```
The matrix shows that there is only a poor correlation between the features and the target (inter_pol). Therefore, the features will not be used further.
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id: tags:
```
import sys
sys.path.append('../src')
import preprocessing
```
%%%% Output: stream
/Users/davidblumenthal/opt/miniconda3/envs/bda/lib/python3.9/site-packages/tslearn/bases/bases.py:15: UserWarning: h5py not installed, hdf5 features will not be supported.
Install h5py to use hdf5 features: http://docs.h5py.org/
warn(h5py_msg)
%% Cell type:code id: tags:
```
```
import pandas as pd
import numpy as np
# function uses the model and the initial input set to predict a given number of following values
def predict_values(model, initial_set, values, window_size):
    """Predict ``values`` consecutive steps by feeding predictions back in.

    Args:
        model: object whose ``predict(x, verbose=0)`` accepts an array of
            shape ``(1, window_size, 1)``; ``yhat[0][0]`` of its result is
            taken as the predicted value.
        initial_set: numpy array with the ``window_size`` starting values.
        values: number of predictions to produce.
        window_size: length of the model input window.

    Returns:
        List with the ``values`` predicted values, in prediction order.
    """
    window = initial_set
    predictions = []
    for _ in range(values):
        yhat = model.predict(window.reshape((1, window_size, 1)), verbose=0)
        predictions.append(yhat[0][0])
        # slide the window: append the new prediction, drop the oldest value
        window = np.append(window, yhat)[1:]
    return predictions
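# function uses the model and the initial input set to predict a given number of following values.
# additionally the gradient of the input window is monitored: once its mean gradient is above -1
# and the newest value is below 20, the input window is reset to the given `initial` window.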
def predict_values_with_gradient(model, initial_set, initial, values, window_size):
    """Predict ``values`` consecutive steps, resetting the window when it flattens.

    Works like a rolling forecast: each prediction is appended to the input
    window and the oldest value is dropped. After every step the gradient of
    the window is checked; when its mean is greater than -1 and the newest
    value is below 20, the window is replaced by ``initial``.

    Args:
        model: object whose ``predict(x, verbose=0)`` accepts an array of
            shape ``(1, window_size, 1)``; ``yhat[0][0]`` of its result is
            taken as the predicted value.
        initial_set: numpy array with the ``window_size`` starting values.
        initial: numpy array used as the new input window after a reset.
        values: number of predictions to produce.
        window_size: length of the model input window.

    Returns:
        List with the ``values`` predicted values, in prediction order.
    """
    window = initial_set
    predictions = []
    for _ in range(values):
        yhat = model.predict(window.reshape((1, window_size, 1)), verbose=0)
        predictions.append(yhat[0][0])
        # slide the window: append the new prediction, drop the oldest value
        window = np.append(window, yhat)[1:]

        # mean gradient over the current window
        slopes = np.gradient(window)
        mean_slope = sum(slopes) / len(slopes)
        # the curve has flattened out at a low level: restart from `initial`
        if mean_slope > -1 and window[window_size - 1] < 20:
            window = initial
    return predictions
......@@ -2,6 +2,10 @@ import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import gc
from mat4py import loadmat
import json
import datetime
# function loads the preprocessed data into DataFrames.
def import_preprocessed_data():
......@@ -75,3 +79,86 @@ def add_empties_column(df, values):
df_final['empties'][i] = 1
return df_final
# function parses JSON data into csv file format. Only the important information is saved in csv
def create_csv_files():
    """Convert the raw JSON sensor dumps into one CSV file per input file.

    Every ``.txt`` file in ``../data/raw/data/`` is parsed as JSON. The
    record list is taken from index 1 of the parsed document. Selected
    fields of each record are flattened into a DataFrame, the leading token
    of the Height/Voltage/Temperature/Tilt/Tx Event strings is converted to
    an integer, the row order is reversed, and the result is written to
    ``../data/preprocessed/CSV/<deveui>.csv``, where ``<deveui>`` is taken
    from the first record of the file.

    Files without any record are skipped, because no file name can be
    derived for them. The ``Messagetype`` column is declared but never
    filled, so it is written empty.

    Raises:
        OSError: if a folder or file cannot be read or written.
        json.JSONDecodeError: if a file does not contain valid JSON.
        KeyError: if a record lacks one of the expected fields.
        ValueError: if a measurement does not start with an integer token.
    """
    path_to_json = '../data/raw/data/'
    csv_folder = '../data/preprocessed/CSV/'
    columns = ['id','deveui', 'unix_time', 'client_id', 'created_at', 'Status', 'Sensor ID', 'Events', 'Height', 'Voltage', 'Temperature', 'Tilt', 'Tx Event', 'Messagetype']
    # columns whose raw value is "<number> <rest>"; only the number is kept
    measurement_columns = ['Height', 'Voltage', 'Temperature', 'Tilt', 'Tx Event']

    json_files = [name for name in os.listdir(path_to_json) if name.endswith('.txt')]
    for filename in json_files:
        # context manager closes the file even when parsing fails
        with open(path_to_json + filename) as f:
            data = json.load(f)
        json_objects = data[1]

        # Collect plain dicts and build the frame once; DataFrame.append was
        # removed in pandas 2.0 and copied the whole frame for every row.
        records = []
        for obj in json_objects:
            sensor = obj['decoded_data']['sensor_data']
            records.append({
                'id': int(obj['id']),
                'deveui': str(obj['deveui']),
                'unix_time': int(obj['unix_time']),
                'client_id': str(obj['client_id']),
                'created_at': str(obj['created_at']),
                'Status': sensor['Status'],
                'Sensor ID': str(sensor['Sensor ID']),
                'Events': str(sensor['Events']),
                'Height': str(sensor['Height 1']),
                'Voltage': str(sensor['Voltage']),
                'Temperature': str(sensor['Temperature']),
                'Tilt': str(sensor['Tilt']),
                'Tx Event': str(sensor['Tx Event']),
            })
        if not records:
            # nothing to save and no deveui to name the file after
            continue

        df = pd.DataFrame(records, columns=columns)

        # parse data into correct data types
        for column in measurement_columns:
            df[column] = df[column].apply(lambda x: str(x).split(' ')[0]).astype('int')

        # save the DataFrame with reversed row order (original index labels kept)
        target_name = records[0]['deveui'] + ".csv"
        df[::-1].to_csv(csv_folder + target_name)
# function will assign the holiday value to the given container data frame
def assign_holidays(df, df_holiday):
    """Add a ``holiday`` column to ``df`` by looking up each row's date.

    The column is written in place on ``df``; nothing is returned.

    Args:
        df: DataFrame with a ``time_stamp`` column of ``"%Y-%m-%d"`` strings.
        df_holiday: DataFrame with a ``Date`` column of ``"%Y-%m-%d"`` strings
            and a ``Holiday`` column holding the value to assign.

    Raises:
        ValueError: if a date string does not match ``"%Y-%m-%d"``.

    Rows whose date does not occur in ``df_holiday`` get ``0``.
    """
    form = "%Y-%m-%d"
    parse = datetime.datetime.strptime

    # Default value; also keeps an integer column when ``df`` has no rows.
    df['holiday'] = 0
    if not len(df):
        return

    # Parse every calendar date once and index it. When a date occurs more
    # than once the last entry wins, like a full scan without ``break``.
    holiday_by_date = {
        parse(date, form): flag
        for date, flag in zip(df_holiday['Date'], df_holiday['Holiday'])
    }

    # Assign the whole column at once: no chained ``df[col][i] = ...`` writes
    # (SettingWithCopyWarning) and no dependence on a 0..n-1 index.
    df['holiday'] = [
        holiday_by_date.get(parse(stamp, form), 0) for stamp in df['time_stamp']
    ]
# function will assign weather data of Frankfurt airport to container data frame
def assign_weather(df, df_weather):
    """Add daily weather columns to ``df`` by looking up each row's date.

    The columns ``temp avg``, ``temp min``, ``temp max``, ``rainfall sum``,
    ``snowfall sum`` and ``sunshine minutes`` are written in place on ``df``
    from the ``tavg``, ``tmin``, ``tmax``, ``prcp``, ``snow`` and ``tsun``
    columns of ``df_weather``; nothing is returned.

    Args:
        df: DataFrame with a ``time_stamp`` column of ``"%Y-%m-%d"`` strings.
        df_weather: DataFrame with a ``date`` column of ``"%Y-%m-%d"`` strings
            and the source columns listed above.

    Raises:
        ValueError: if a date string does not match ``"%Y-%m-%d"``.

    Rows whose date does not occur in ``df_weather`` keep the defaults
    (``0.0`` for the float columns, ``0`` for ``sunshine minutes``).
    """
    form = "%Y-%m-%d"
    parse = datetime.datetime.strptime
    # (target column in df, source column in df_weather, default without match)
    column_map = (
        ('temp avg', 'tavg', 0.0),
        ('temp min', 'tmin', 0.0),
        ('temp max', 'tmax', 0.0),
        ('rainfall sum', 'prcp', 0.0),
        ('snowfall sum', 'snow', 0.0),
        ('sunshine minutes', 'tsun', 0),
    )

    # Create the columns with their defaults; this also fixes the dtypes
    # when ``df`` has no rows.
    for target, _, default in column_map:
        df[target] = default
    if not len(df):
        return

    # Parse every weather date once and remember its row position. When a
    # date occurs more than once the last row wins, like a full scan
    # without ``break``.
    row_by_date = {
        parse(date, form): pos for pos, date in enumerate(df_weather['date'])
    }
    rows = [row_by_date.get(parse(stamp, form)) for stamp in df['time_stamp']]

    # Assign whole columns at once: no chained ``df[col][i] = ...`` writes
    # (SettingWithCopyWarning) and no dependence on a 0..n-1 index.
    for target, source, default in column_map:
        source_values = df_weather[source].tolist()
        values = [default if pos is None else source_values[pos] for pos in rows]
        if isinstance(default, float):
            # Keep float columns float even when the source holds integers.
            values = [float(value) for value in values]
        df[target] = values
\ No newline at end of file
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
# function will take container data as input and return LSTM ready input format data.
# The window length and overlap can be set and a timeseries generator is used.
def create_windows_smoothed(dfs, length, batch_size, stride):
    """Turn the ``inter_pol`` series of each DataFrame into windowed samples.

    Each DataFrame's ``inter_pol`` column serves as both feature and target
    of a ``TimeseriesGenerator``; the batches of all DataFrames are collected
    in order.

    Args:
        dfs: iterable of DataFrames that contain an ``inter_pol`` column.
        length: window length passed to the generator.
        batch_size: batch size passed to the generator.
            NOTE(review): every feature batch is reshaped to ``(length, 1)``,
            so this appears to work only with ``batch_size=1`` - confirm.
        stride: stride passed to the generator.

    Returns:
        Tuple ``(X, y)``: ``X`` stacks the reshaped ``(length, 1)`` windows,
        ``y`` stacks the target batches in the same order.
    """
    window_batches = []
    target_batches = []

    # Height as feature and target
    for df in dfs:
        height = df['inter_pol'].to_numpy().tolist()
        # apply TimeSeriesGenerator
        ts_generator = TimeseriesGenerator(
            height, height, length=length, batch_size=batch_size, stride=stride
        )
        for batch_index in range(len(ts_generator)):
            batch_x, batch_y = ts_generator[batch_index]
            window_batches.append(batch_x)
            target_batches.append(batch_y)

    # reshape data for neural network
    X = np.array([np.reshape(batch, (length, 1)) for batch in window_batches])
    y = np.array(target_batches)
    return X, y
# function creates train/validation/test split on data
def create_train_val_test_split(X, y, random_state=None):
    """Split ``X``/``y`` into train, test and validation sets.

    First 20% of the samples are set aside as the validation set, then 20%
    of the remaining samples are set aside as the test set.

    Args:
        X: feature samples.
        y: targets aligned with ``X``.
        random_state: optional seed forwarded to both ``train_test_split``
            calls so the split can be reproduced. The default ``None``
            keeps the previous behavior.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, X_val, y_val)``.
    """
    # hold out the validation set first
    X_rest, X_val, y_rest, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # split the remainder into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X_rest, y_rest, test_size=0.2, random_state=random_state
    )
    return X_train, y_train, X_test, y_test, X_val, y_val
\ No newline at end of file