Commit 3b236a48 authored by tills's avatar tills
Browse files
parents eb7fcb6a 10f21c74
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import os
```
%% Cell type:code id: tags:
``` python
# Relative locations of the project's data folders (Windows-style separators).
# Two groups of names coexist here: the `..\data` variants
# (file_data, data_name, write_to_path) and the `..\..\data` variants (path_*).
# NOTE(review): the first group looks like a leftover from an older folder
# layout - check the remaining references before removing it.
file_data=r'..\data\raw'
# Directory listing of the raw-data folder, evaluated once when this cell runs
data_name=os.listdir(file_data)
write_to_path = r'..\data\preprocessed'
path_data = r'..\..\data'
path_raw = r'..\..\data\raw'
path_preprocessed = r'..\..\data\preprocessed'
path_additional = r'..\..\data\additional data'
```
%% Cell type:code id: tags:
``` python
def load_data(container):
    """Load the raw sensor records of one container.

    The file ``<container>.txt`` inside ``path_raw`` is read as line-delimited
    JSON; the records are taken from column ``1`` of the first line.

    Args:
        container: Container id, i.e. the file name without ``.txt``.

    Returns:
        DataFrame built from that first-line payload.
    """
    # The path used to be assembled from `file_data` first and was then
    # immediately overwritten; only the `path_raw` location was ever read.
    file_number = path_raw + '\\' + container + '.txt'
    df = pd.read_json(file_number, lines=True)
    raw_data = pd.DataFrame(df[1][0])
    return raw_data
```
%% Cell type:code id: tags:
``` python
def reverse_data(raw_data):
    """Return the rows of ``raw_data`` in reverse order with a fresh index.

    The loaded raw data is in the wrong historical order (the youngest entry
    comes first); flipping it puts the oldest measurement at the top. The
    old index is dropped, the result is numbered from 0 again.
    """
    flipped = raw_data.loc[::-1]
    return flipped.reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
def _strip_unit(column, unit):
    """Remove the literal ``unit`` text from a string column and cast to int."""
    return column.str.replace(unit, "", regex=False).astype(int)


def dec_data(data):
    """Decode the JSON payload in ``data.decoded_data`` into numeric columns.

    The unit letters ("cm", "mV", "C", "Degree") are removed from the sensor
    readings and the remaining text is converted to integers. A ``time``
    column is derived from the first 19 characters of ``created_at``.

    Args:
        data: DataFrame with the columns ``decoded_data`` and ``created_at``.

    Returns:
        The same DataFrame object, extended in place by the new columns.
    """
    df_decoded_data = pd.json_normalize(data.decoded_data)

    # Use the frame that was passed in, not the notebook-global `raw_data`:
    # the global still has the un-reversed row order, which attached the
    # timestamps to the wrong rows.
    data["time"] = pd.to_datetime(data["created_at"].map(lambda st: str(st)[0:19]))

    height_1 = _strip_unit(df_decoded_data["sensor_data.Height 1"], "cm")
    # NOTE(review): 140 presumably is the distance (cm) the sensor reads for
    # an empty container, so this turns distance into fill height - confirm.
    data["Height (cm)"] = 140 - height_1
    data["Height 1 (cm)"] = height_1
    data["Height 2 (cm)"] = _strip_unit(df_decoded_data["sensor_data.Height 2"], "cm")
    data["Height 3 (cm)"] = _strip_unit(df_decoded_data["sensor_data.Height 3"], "cm")
    data["Height 4 (cm)"] = _strip_unit(df_decoded_data["sensor_data.Height 4"], "cm")
    data["Voltage (mV)"] = _strip_unit(df_decoded_data["sensor_data.Voltage"], "mV")
    data["Temperature (C)"] = _strip_unit(df_decoded_data["sensor_data.Temperature"], "C")
    data["Tilt (Degree)"] = _strip_unit(df_decoded_data["sensor_data.Tilt"], "Degree")
    return data
```
%% Cell type:code id: tags:
``` python
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data):
    """Replace implausible sensor readings in place.

    * Temperatures of 150 or more are overwritten with the value of the
      previous row (the first row takes the value of the second row).
    * Negative heights are set to 0.

    Args:
        data: DataFrame with the columns 'Temperature (C)' and 'Height (cm)'.

    Returns:
        The same DataFrame object, modified in place.
    """
    temperature = 'Temperature (C)'
    height = 'Height (cm)'
    temperature_pos = data.columns.get_loc(temperature)

    # Work with row positions instead of index labels so the neighbour
    # lookup also holds when the index is not 0..n-1.
    outlier_positions = (data[temperature] >= 150).to_numpy().nonzero()[0]
    for pos in outlier_positions:
        pos = int(pos)
        if pos != 0:
            neighbour = pos - 1
        elif len(data) > 1:
            neighbour = pos + 1
        else:
            # Single-row frame: there is no neighbour to borrow a value from.
            continue
        # Rows are handled top to bottom, so a run of outliers inherits the
        # last good value that precedes the run.
        data.iloc[pos, temperature_pos] = data.iloc[neighbour, temperature_pos]

    data.loc[data[height] < 0, height] = 0
    return data
```
%% Cell type:code id: tags:
``` python
def write_preprocessed_data(container, data):
    """Write ``data`` as CSV to ``<container>.txt``.

    The file is written twice: once below ``write_to_path`` and once below
    ``path_preprocessed``.
    NOTE(review): the two targets look like an old and a new folder layout -
    confirm whether both are still needed.

    Args:
        container: Container id, used as the file name (without extension).
        data: DataFrame to export.
    """
    file_name = container + '.txt'
    data.to_csv(path_or_buf=write_to_path + '/' + file_name)
    data.to_csv(path_or_buf=path_preprocessed + '/' + file_name)
```
%% Cell type:code id: tags:
``` python
# TODO: improve algorithm
# TODO: test on multiple container files
def detect_collection(data, limit=0.25, min_height=110):
    """Detect trash collections from sudden changes in 'Height (cm)'.

    A row counts as a collection when its height exceeds the previous row's
    height by more than ``limit`` times that previous height and the new
    height is above ``min_height``. The first row is never reported because
    it has no predecessor.

    Args:
        data: DataFrame with a 'Height (cm)' column.
        limit: Relative jump threshold (fraction of the previous height).
        min_height: Height the new reading has to exceed.

    Returns:
        List of index labels of the rows detected as collections.
    """
    collections = []
    last_value = None
    for idx, height in data['Height (cm)'].items():
        if last_value is not None:
            difference = height - last_value
            if difference > last_value * limit and height > min_height:
                collections.append(idx)
        last_value = height
    return collections
```
%% Cell type:code id: tags:
``` python
# Load the weather / holiday table from the additional-data workbook
weather_data = pd.read_excel(r'..\data\additional data\additional_data.xlsx')
# Bare expression so the notebook renders the table as the cell output
weather_data
```
%%%% Output: execute_result
MESS_DATUM Temperatur Relative Feuchte Niederschlagsmenge \
0 2020-04-01 00:00:00 -0.6 57.0 0.0
1 2020-04-01 01:00:00 -1.0 58.0 0.0
2 2020-04-01 02:00:00 0.5 54.0 0.0
3 2020-04-01 03:00:00 -1.1 62.0 0.0
4 2020-04-01 04:00:00 -0.1 58.0 0.0
... ... ... ... ...
10219 2021-05-31 19:00:00 19.0 40.0 0.0
10220 2021-05-31 20:00:00 16.2 49.0 0.0
10221 2021-05-31 21:00:00 13.4 60.0 0.0
10222 2021-05-31 22:00:00 12.2 66.0 0.0
10223 2021-05-31 23:00:00 12.3 68.0 0.0
Feiertag Wochentag Frei[1/0]
0 0 Mittwoch 0
1 0 Mittwoch 0
2 0 Mittwoch 0
3 0 Mittwoch 0
4 0 Mittwoch 0
... ... ... ...
10219 0 Montag 0
10220 0 Montag 0
10221 0 Montag 0
10222 0 Montag 0
10223 0 Montag 0
[10224 rows x 7 columns]
%% Cell type:code id: tags:
``` python
def get_collection_data(data, collections, weather_data=None):
    """Build one feature record per detected collection.

    For every collection the rows since the previous collection (or since
    the first row, for the first collection) are summarised together with
    the weather records that fall into the same time span.

    Args:
        data: Measurements of one container, addressed by row position. Must
            provide 'unix_time' (divided by 1000 before conversion, i.e.
            treated as milliseconds), 'created_at', 'deveui', 'Height (cm)'
            and 'Temperature (C)'.
        collections: Row positions (for ``data.iloc``) of the collections.
        weather_data: Optional table with the columns 'MESS_DATUM',
            'Temperatur', 'Niederschlagsmenge', 'Relative Feuchte' and
            'Frei[1/0]'. When omitted, ``additional_data.xlsx`` is read from
            ``path_additional``.

    Returns:
        List of dicts, one per entry of ``collections``, in the same order.
    """
    if weather_data is None:
        # Read once per call; previously the workbook was loaded twice and
        # the first result was thrown away.
        weather_data = pd.read_excel(path_additional + '\\additional_data.xlsx')

    collections_data = []
    last_collection = 0  # the first interval starts at the first row
    for collection in collections:
        row = data.iloc[collection]

        # Rows since the last collection, the row of the last collection and
        # the last measurement before this collection
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection - 1]

        collection_time = datetime.datetime.fromtimestamp(int(row['unix_time']) / 1000)
        last_collection_time = datetime.datetime.fromtimestamp(
            int(data_last_collection['unix_time']) / 1000
        )
        # Deliberately "previous minus current": the stored value is
        # negative, the sign is kept unchanged for existing consumers.
        time_difference = last_collection_time - collection_time

        in_window = (
            (weather_data['MESS_DATUM'] <= collection_time)
            & (weather_data['MESS_DATUM'] >= last_collection_time)
        )
        weather_since_last_collection = weather_data[in_window]
        # Share of weather records flagged as free day; the mean of the
        # boolean mask yields NaN (instead of a ZeroDivisionError) when no
        # weather record falls into the window.
        holiday_share = (weather_since_last_collection['Frei[1/0]'] == 1).mean()

        sensor_temperature = data_since_last_collection['Temperature (C)']
        weather_temperature = weather_since_last_collection['Temperatur']
        weather_rain = weather_since_last_collection['Niederschlagsmenge']
        weather_moisture = weather_since_last_collection['Relative Feuchte']

        collections_data.append({
            'timestamp': row['created_at'],
            'container_id': row['deveui'],
            'last_collection': time_difference.days,
            'pre_height': data_last_measurement['Height (cm)'],
            'post_height': row['Height (cm)'],
            'sensor_mean_temperature': sensor_temperature.mean(),
            'sensor_max_temperature': sensor_temperature.max(),
            'sensor_min_temperature': sensor_temperature.min(),
            'weather_mean_temperature': weather_temperature.mean(),
            'weather_max_temperature': weather_temperature.max(),
            'weather_min_temperature': weather_temperature.min(),
            'weather_mean_rain': weather_rain.mean(),
            'weather_max_rain': weather_rain.max(),
            'weather_min_rain': weather_rain.min(),
            'weather_mean_moisture': weather_moisture.mean(),
            'weather_max_moisture': weather_moisture.max(),
            'weather_min_moisture': weather_moisture.min(),
            'holiday_percentage': holiday_share,
            'year': collection_time.year,
            'month': collection_time.month,
            'weekday': collection_time.weekday(),
        })
        # Track the predecessor directly instead of searching the list with
        # collections.index(), which is quadratic and picks the wrong entry
        # when a position occurs twice.
        last_collection = collection
    return collections_data
```
%% Cell type:code id: tags:
``` python
# Each raw data file gets processed and is then written into a new file in
# data/preprocessed. The collection records of all containers are gathered
# in one list and turned into `collection_df` once at the end, because
# DataFrame.append no longer exists in pandas 2.0.
collection_records = []
containers = os.listdir(path_raw)
for file in containers:
    container_id = file.replace('.txt', '')
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    # remove_outliers works in place, so `preprocessed` and `decoded_data`
    # refer to the same cleaned frame from here on.
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(decoded_data)
    collections_data = get_collection_data(decoded_data, collections)
    collection_records.extend(collections_data)
    write_preprocessed_data(container_id, preprocessed)
collection_df = pd.DataFrame(collection_records)
```
%% Cell type:code id: tags:
``` python
# Write all available collection data into one CSV file.
# The file is written twice: once below write_to_path and once below path_data.
collection_df.to_csv(path_or_buf=write_to_path+'/collection_data.txt')
collection_df.to_csv(path_or_buf=path_data + '/collection_data.txt')
```
%% Cell type:code id: tags:
``` python
```
......
%% Cell type:code id:1a7e310f tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
```
%% Cell type:code id:bddc2445 tags:
``` python
# Build a data-folder path from the working directory: the last 9 characters
# of the cwd are cut off and 'data' is appended.
file=os.getcwd()
file_m=file[0:(len(file)-9)]
file_dz='data'
file_data=file_m+file_dz
data_name=os.listdir(file_data)
data='collection_data'
file_number=file_m+file_dz+'\\'+data+'.txt'
# The next three assignments overwrite `data` and `file_number`, so the CSV
# is actually read from the relative `..\..\data` folder.
data_path = r'..\..\data'
data='collection_data.txt'
file_number=data_path+'\\'+data
df = pd.read_csv(file_number)
# Keep the text in front of the first "d" of every 'last_collection' entry
# (needs string values that contain a "d"), convert it to int ...
df["collection_intervall"] = list(map(lambda st: str(st)[0:int(st.index("d"))],df["last_collection"]))
df["collection_intervall"]=df["collection_intervall"].astype(int)
# ... and flip the sign of the result.
df["collection_intervall"]= list(map(lambda z: z*(-1),df["collection_intervall"]))
# Constant 1 per row, stored as int
df["number_collections"]=np.ones(len(df["collection_intervall"]))
df["number_collections"]=df["number_collections"].astype(int)
```
%% Cell type:markdown id:6e02f3a6 tags:
Hinzufügen der geschätzten Anzahl der Leerungen jedes Containers
%% Cell type:code id:aab3daa4 tags:
``` python
# Number of collections per container: the 'number_collections' column is
# summed up within each 'container_id' group.
number_collections = (
    df[['container_id', 'number_collections']]
    .groupby(['container_id'])
    .sum()
)
number_collections
```
%%%% Output: execute_result
number_collections
container_id
70B3D500700016DA 20
70B3D500700016DE 11
70B3D500700016DF 46
70B3D500700016E0 38
70B3D500700016E5 18
... ...
70B3D50070001782 138
70B3D50070001786 9
70B3D50070001787 81
70B3D50070001788 44
70B3D50070001789 99
[76 rows x 1 columns]
%% Cell type:code id:6bba9c37 tags:
``` python
# Per-container mean of the selected features ('Lockdown' is carried along
# under the lower-case name 'lockdown'), extended by the collection count.
selected_columns = [
    'container_id',
    'collection_intervall',
    'pre_height',
    'post_height',
    'sensor_mean_temperature',
    'Lockdown',
]
relevant_data = df[selected_columns].rename(columns={'Lockdown': 'lockdown'})
data_all = relevant_data.groupby(['container_id']).mean()
data_all['number_collections'] = number_collections['number_collections']
data_all
```
%%%% Output: execute_result
collection_intervall pre_height post_height \
container_id
70B3D500700016DA 18.450000 123.700000 12.400000
70B3D500700016DE 27.545455 93.090909 34.000000
70B3D500700016DF 7.652174 72.956522 17.739130
70B3D500700016E0 9.815789 53.578947 2.526316
70B3D500700016E5 19.444444 60.222222 6.555556
... ... ... ...
70B3D50070001782 3.195652 68.898551 21.188406
70B3D50070001786 14.777778 64.888889 28.666667
70B3D50070001787 4.962963 77.753086 16.913580
70B3D50070001788 8.500000 68.363636 10.727273
70B3D50070001789 4.222222 73.575758 11.676768
sensor_mean_temperature lockdown number_collections
container_id
70B3D500700016DA 9.618631 0.550000 20
70B3D500700016DE 11.847655 0.909091 11
70B3D500700016DF 12.407369 0.652174 46
70B3D500700016E0 18.086524 0.578947 38
70B3D500700016E5 10.636161 0.555556 18
... ... ... ...
70B3D50070001782 23.158873 0.275362 138
70B3D50070001786 22.170899 0.000000 9
70B3D50070001787 7.347588 0.728395 81
70B3D50070001788 11.766303 0.522727 44
70B3D50070001789 18.541976 0.434343 99
[76 rows x 6 columns]
%% Cell type:markdown id:63323896 tags:
Herausfiltern aller "Ausreißer-Container", damit das Modelling nicht verfälscht wird. <br>
"Ausreißer-Container" sind definiert als alle Container, die mehr als 40 Leerungen haben oder im Durchschnitt ein Leerungsintervall von über 75 Tagen aufweisen. Die Schwellwerte wurden anhand der Visualisierung des Clustering-Notebooks ausgewählt.
%% Cell type:code id:4e74d2b7 tags:
``` python
# Thresholds (inclusive upper bounds)
T_colecction_intervall = 75
T_number_collections = 40
# Keep only the containers that stay within both bounds
within_limits = (
    (data_all['collection_intervall'] <= T_colecction_intervall)
    & (data_all['number_collections'] <= T_number_collections)
)
data_all = data_all[within_limits].reset_index()
container = data_all['container_id'].tolist()
container
```
%%%% Output: execute_result
['70B3D500700016DA',
'70B3D500700016DE',
'70B3D500700016E0',
'70B3D500700016E5',
'70B3D500700016E7',
'70B3D500700016EB',
'70B3D500700016F1',
'70B3D500700016F4',
'70B3D500700016FA',
'70B3D50070001700',
'70B3D50070001706',
'70B3D50070001709',
'70B3D50070001710',
'70B3D50070001716',
'70B3D50070001725',
'70B3D50070001726',
'70B3D50070001727',
'70B3D5007000172B',
'70B3D5007000172C',
'70B3D5007000172D',
'70B3D5007000172E',
'70B3D50070001734',
'70B3D50070001737',
'70B3D50070001738',
'70B3D5007000173A',
'70B3D5007000173C',
'70B3D50070001740',
'70B3D50070001742',
'70B3D50070001747',
'70B3D5007000174F',
'70B3D50070001770',
'70B3D50070001779',
'70B3D50070001780',
'70B3D50070001786']
%% Cell type:code id:01762362 tags:
``` python
# Training set: all collection records of the containers listed in
# `container`, grouped in the order of that list.
# The per-container slices are concatenated in one step; DataFrame.append
# no longer exists in pandas 2.0 and copied the frame on every iteration.
per_container = [df[df['container_id'] == item] for item in container]
# pd.concat rejects an empty list, so fall back to an empty frame with the
# same columns when no container passed the filter.
train_data = pd.concat(per_container) if per_container else df.iloc[0:0]
```
%% Cell type:code id:92cd50bb tags:
``` python
# Bare expression so the notebook renders the training set as cell output
train_data
```
%%%% Output: execute_result
Unnamed: 0 timestamp container_id \
0 0 2020-05-22 18:51:01.742945 70B3D500700016DA
1 1 2020-06-05 14:49:42.681218 70B3D500700016DA
2 2 2020-06-29 13:47:52.050553 70B3D500700016DA
3 3 2020-07-17 13:46:18.287249 70B3D500700016DA
4 4 2020-08-07 09:44:36.149679 70B3D500700016DA
... ... ... ...
4381 4381 2020-08-08 15:42:32.866709 70B3D50070001786
4382 4382 2020-08-09 15:42:30.118122 70B3D50070001786
4383 4383 2020-08-11 12:42:24.962069 70B3D50070001786
4384 4384 2020-09-07 13:40:11.695782 70B3D50070001786
4385 4385 2020-09-14 15:39:33.709211 70B3D50070001786
last_collection pre_height post_height \
0 -14 days +06:00:58.208000 136 16
1 -14 days +04:01:19.058000 120 14
2 -24 days +01:01:50.633000 136 14
3 -18 days +00:01:33.806000 128 12
4 -21 days +04:01:42.126000 118 14
... ... ... ...
4381 -2 days +21:00:01.687000 64 28
4382 -1 days +00:00:02.748000 60 24
4383 -2 days +03:00:05.138000 70 30
4384 -28 days +23:02:13.362000 62 30
4385 -8 days +22:00:37.993000 64 28
sensor_mean_temperature sensor_max_temperature sensor_min_temperature \
0 15.251029 47 0
1 16.410714 44 4
2 18.255446 43 4
3 19.053476 45 7
4 21.981524 47 6
... ... ... ...
4381 33.296296 59 16
4382 32.217391 60 15
4383 28.121951 60 15
4384 20.341463 55 7
4385 20.869281 44 6
weather_mean_temperature ... weather_mean_moisture \
0 14.283636 ... 58.121212
1 16.873193 ... 53.888554
2 18.670261 ... 65.890435
3 19.258796 ... 58.773148
4 21.973000 ... 49.794000
... ... ... ...
4381 28.659259 ... 40.925926
4382 28.104167 ... 44.416667
4383 28.011111 ... 47.222222
4384 19.830663 ... 67.885978
4385 18.970000 ... 58.441176
weather_max_moisture weather_min_moisture holiday_percentage \
0 95.0 25.0 0.360606
1 93.0 19.0 0.361446
2 97.0 25.0 0.375652
3 96.0 22.0 0.222222
4 95.0 20.0 0.288000
... ... ... ...
4381 64.0 23.0 0.592593
4382 75.0 21.0 1.000000
4383 69.0 23.0 0.177778
4384 98.0 28.0 0.295840
4385 94.0 26.0 0.282353
Lockdown year month weekday collection_intervall number_collections
0 0.0 2020 5 4 14 1
1 0.0 2020 6 4 14 1
2 0.0 2020 6 0 24 1
3 0.0 2020 7 4 18 1
4 0.0 2020 8 4 21 1
... ... ... ... ... ... ...
4381 0.0 2020 8 5 2 1
4382 0.0 2020 8 6 1 1
4383 0.0 2020 8 1 2 1
4384 0.0 2020 9 0 28 1
4385 0.0 2020 9 0 8 1
[632 rows x 25 columns]
%% Cell type:code id:a1dcbff5 tags:
``` python
# Export the training set as CSV. It is written twice, once per folder
# layout (`..\data\...` and `..\..\data\...`); afterwards `data_path`
# points at the second location.
data_path = r'..\data\modeling\train'
train_data.to_csv(path_or_buf=data_path+'/train_data.txt')
data_path = r'..\..\data\modeling\train'
train_data.to_csv(path_or_buf=data_path+'\\train_data.txt')
```
......
%% Cell type:code id:de2f5927 tags:
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from pandas.io.json import json_normalize
import plotly.graph_objects as go
import os
from ipywidgets import interact, interact_manual
import ipywidgets as widgets
import datetime
from scipy.signal import savgol_filter
```
%% Cell type:code id:047d7eec tags:
%% Cell type:code id: tags:
``` python
# Path of the preprocessed folder derived from the working directory
# (its last 9 characters are cut off, then 'data\preprocessed' is appended).
file=os.getcwd()
file_m=file[0:(len(file)-9)]
file_dz='data\\preprocessed'#
file_data=file_m+file_dz
file_data
data_name=os.listdir(file_data)
# Relative path to the same kind of folder; `data_name` is listed again from
# here and replaces the listing taken above.
path_preprocessed = r'..\..\data\preprocessed'
data_name=os.listdir(path_preprocessed)
```
%% Cell type:code id:e79fd35b tags:
%% Cell type:code id: tags:
``` python
def load_data(container: str, start, end):
    """Load the preprocessed CSV of one container, limited to a time window.

    Args:
        container: Container id, i.e. the file name without ``.txt`` inside
            ``path_preprocessed``.
        start: Lower bound of the window (inclusive), anything accepted by
            ``pd.to_datetime``.
        end: Upper bound of the window (inclusive), anything accepted by
            ``pd.to_datetime``.

    Returns:
        DataFrame indexed by ``time``; the previous row index is kept as the
        column ``index``.
    """
    # The path used to be assembled from `file_m`/`file_dz` first and was
    # then immediately overwritten; only this location was ever read.
    file_number = path_preprocessed + '\\' + container + '.txt'
    df = pd.read_csv(file_number)
    # Only the first 19 characters of 'created_at' are parsed
    df["time"] = pd.to_datetime(list(map(lambda st: str(st)[0:19], df["created_at"])))
    data = df[df["time"] >= pd.to_datetime(start)]
    data = data[data["time"] <= pd.to_datetime(end)]
    data = data.reset_index()
    data = data.set_index('time')
    return data
```
%% Cell type:code id:75698570 tags:
%% Cell type:code id: tags:
``` python
def data_arith_mean(data_g, auflösung: str):