Commit 90d06782 authored by tills's avatar tills
Browse files

Clean up

parent 6df313ae
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
delete me, just to keep the folder in git
\ No newline at end of file
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import os
```
%% Cell type:code id: tags:
``` python
# Locations of the raw input files and of the preprocessed output.
# Built with os.path.join so the notebook does not depend on '\' being the
# path separator; on Windows the resulting strings are unchanged.
file_data = os.path.join('..', 'data', 'raw')
# Names of all entries found in the raw data folder
data_name = os.listdir(file_data)
write_to_path = os.path.join('..', 'data', 'preprocessed')
```
%% Cell type:code id: tags:
``` python
# Loads the raw JSON file for a specific container
def load_data(container):
    """Read the raw export of one container into a DataFrame.

    Parameters
    ----------
    container : str
        File name without the '.txt' extension; the file is looked up in
        the module-level ``file_data`` folder.

    Returns
    -------
    pandas.DataFrame
        Built from the value found in column ``1`` of the first line of the
        line-delimited JSON file.
        NOTE(review): presumably that cell holds the list of measurement
        records -- confirm against a raw file.
    """
    # os.path.join instead of a hard-coded '\\' keeps the path valid on
    # every platform.
    file_number = os.path.join(file_data, container + '.txt')
    df = pd.read_json(file_number, lines=True)
    raw_data = pd.DataFrame(df[1][0])
    return raw_data
```
%% Cell type:code id: tags:
``` python
# Flips the row order of the dataframe passed into the function.
# The loaded raw data arrives newest-first, so it has to be turned around
# to be in chronological order.
def reverse_data(raw_data):
    """Return a copy of ``raw_data`` with rows reversed and a fresh 0..n-1 index."""
    flipped = raw_data.loc[::-1]
    return flipped.reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
# Decodes the JSON data in the 'decoded_data' column of the dataframe.
# Also removes the unit letters from the columns listed below and transforms them into integers.
def dec_data(data):
    """Add a parsed timestamp and numeric sensor columns to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'created_at' column and a 'decoded_data' column whose
        entries normalize to 'sensor_data.<field>' columns holding strings
        such as '12cm', '3600mV', '21C' or '2Degree'.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place, with the added columns 'time',
        'Height (cm)', 'Height 1 (cm)' .. 'Height 4 (cm)', 'Voltage (mV)',
        'Temperature (C)' and 'Tilt (Degree)'.
    """
    df_decoded_data = pd.json_normalize(data["decoded_data"])

    def strip_unit(field, unit):
        # Literal (non-regex) removal of the unit suffix, then integer cast.
        column = df_decoded_data["sensor_data." + field]
        return column.str.replace(unit, "", regex=False).astype(int)

    # Use the rows of the frame being decoded, not a global: the caller
    # passes a re-ordered frame, so timestamps taken from any other frame
    # would end up attached to the wrong rows.
    # Only the first 19 characters ('YYYY-MM-DD HH:MM:SS') are parsed.
    data["time"] = pd.to_datetime([str(stamp)[:19] for stamp in data["created_at"]])
    # 'Height (cm)' mirrors sensor 'Height 1', as in the original code.
    data["Height (cm)"] = strip_unit("Height 1", "cm")
    data["Height 1 (cm)"] = strip_unit("Height 1", "cm")
    data["Height 2 (cm)"] = strip_unit("Height 2", "cm")
    data["Height 3 (cm)"] = strip_unit("Height 3", "cm")
    data["Height 4 (cm)"] = strip_unit("Height 4", "cm")
    data["Voltage (mV)"] = strip_unit("Voltage", "mV")
    data["Temperature (C)"] = strip_unit("Temperature", "C")
    data["Tilt (Degree)"] = strip_unit("Tilt", "Degree")
    return data
```
%% Cell type:code id: tags:
``` python
# Removes outliers from the data
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data, threshold=150):
    """Replace implausible temperature readings with a neighbouring valid one.

    A reading is treated as an outlier when 'Temperature (C)' is greater
    than or equal to ``threshold``. Each outlier takes the closest preceding
    non-outlier value; outliers at the very start of the frame, which have
    no predecessor, take the first following non-outlier value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric 'Temperature (C)' column; modified in place.
    threshold : int or float, optional
        Readings at or above this value are replaced (default 150).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    column = 'Temperature (C)'
    temperatures = data[column]
    is_outlier = temperatures >= threshold
    # Nothing to repair, or no valid reading to borrow from: leave as is.
    if not is_outlier.any() or is_outlier.all():
        return data
    # Blank out the outliers, then fill forward (previous valid reading) and
    # backward (covers a run of outliers at the start). This works by row
    # position, so it does not depend on the index being 0..n-1.
    repaired = temperatures.mask(is_outlier).ffill().bfill()
    # Only outlier positions are overwritten; cast back because masking
    # turned an integer column into floats.
    data[column] = temperatures.where(~is_outlier, repaired).astype(temperatures.dtype)
    return data
```
%% Cell type:code id: tags:
``` python
# Stores the dataframe as comma-separated text.
# `container` is used as the file name (with a '.txt' extension) inside the
# module-level `write_to_path` folder.
def write_preprocessed_data(container, data):
    """Write ``data`` to '<write_to_path>/<container>.txt' via ``DataFrame.to_csv``."""
    target = f"{write_to_path}/{container}.txt"
    data.to_csv(path_or_buf=target)
```
%% Cell type:code id: tags:
``` python
# Detects trash collections when sudden changes in height occur
# TODO: improve algorithm
# TODO: test on multiple container files
def detect_collection(data, limit=0.25, min_height=110):
    """Return the index labels of rows that look like a collection event.

    A row is reported when its 'Height (cm)' value exceeds the previous
    row's value by more than ``limit`` times that previous value and is
    itself greater than ``min_height``. The first row is never reported
    because it has no predecessor.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric 'Height (cm)' column, in chronological order.
    limit : float, optional
        Required relative jump compared with the previous reading
        (default 0.25).
    min_height : int or float, optional
        The new reading must be above this value (default 110).

    Returns
    -------
    list
        Index labels of the detected rows, in row order.
    """
    collections = []
    last_value = None
    # Only the height column is needed, so iterate it directly instead of
    # materialising every row with iterrows().
    for idx, height in data['Height (cm)'].items():
        if last_value is not None:
            difference = height - last_value
            if difference > last_value * limit and height > min_height:
                collections.append(idx)
        last_value = height
    return collections
```
%% Cell type:code id: tags:
``` python
# Load the additional (weather / holiday) data for inspection.
# The path is built with os.path.join so it does not rely on '\' being the
# path separator; on Windows the resulting string is unchanged.
weather_data = pd.read_excel(os.path.join('..', 'data', 'additional data', 'additional_data.xlsx'))
weather_data
```
%%%% Output: execute_result
MESS_DATUM Temperatur Relative Feuchte Niederschlagsmenge \
0 2020-04-01 00:00:00 -0.6 57.0 0.0
1 2020-04-01 01:00:00 -1.0 58.0 0.0
2 2020-04-01 02:00:00 0.5 54.0 0.0
3 2020-04-01 03:00:00 -1.1 62.0 0.0
4 2020-04-01 04:00:00 -0.1 58.0 0.0
... ... ... ... ...
10219 2021-05-31 19:00:00 19.0 40.0 0.0
10220 2021-05-31 20:00:00 16.2 49.0 0.0
10221 2021-05-31 21:00:00 13.4 60.0 0.0
10222 2021-05-31 22:00:00 12.2 66.0 0.0
10223 2021-05-31 23:00:00 12.3 68.0 0.0
Feiertag Wochentag Frei[1/0]
0 0 Mittwoch 0
1 0 Mittwoch 0
2 0 Mittwoch 0
3 0 Mittwoch 0
4 0 Mittwoch 0
... ... ... ...
10219 0 Montag 0
10220 0 Montag 0
10221 0 Montag 0
10222 0 Montag 0
10223 0 Montag 0
[10224 rows x 7 columns]
%% Cell type:code id: tags:
``` python
def get_collection_data(data, collections):
    """Build one feature record per detected collection.

    For every position in ``collections`` the rows since the previous
    collection (or since row 0 for the first one) are summarised together
    with the weather data measured in the same time span.

    Parameters
    ----------
    data : pandas.DataFrame
        Preprocessed measurements with a 0..n-1 index and the columns
        'unix_time' (milliseconds, as it is divided by 1000 below),
        'created_at', 'deveui', 'Height (cm)' and 'Temperature (C)'.
    collections : list of int
        Row positions of the detected collections, in ascending order.

    Returns
    -------
    list of dict
        One dictionary per collection; empty when ``collections`` is empty.
    """
    # Nothing to summarise: skip the comparatively slow Excel read.
    if len(collections) == 0:
        return []

    weather_data = pd.read_excel(os.path.join('..', 'data', 'additional data', 'additional_data.xlsx'))
    collections_data = []
    # Pair every collection with its predecessor (0 for the first one)
    # instead of searching the list again for each element.
    previous_positions = [0] + list(collections[:-1])
    for last_collection, collection in zip(previous_positions, collections):
        row = data.iloc[collection]

        # Get data since last collection and data of last collection
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection - 1]

        # Calculate time difference
        collection_time = datetime.datetime.fromtimestamp(int(row['unix_time']) / 1000)
        last_collection_time = datetime.datetime.fromtimestamp(int(data_last_collection['unix_time']) / 1000)
        # NOTE(review): this is "previous minus current", so the stored day
        # count is negative. Kept as is because the already exported
        # collection data uses this sign -- confirm before flipping it.
        time_difference = last_collection_time - collection_time

        # Weather Data
        in_window = (weather_data['MESS_DATUM'] <= collection_time) & (weather_data['MESS_DATUM'] >= last_collection_time)
        weather_data_since_last_collection = weather_data[in_window]
        # Share of rows flagged as free; the mean of the boolean mask yields
        # NaN instead of a ZeroDivisionError when the window is empty.
        holiday_count = (weather_data_since_last_collection['Frei[1/0]'] == 1).mean()

        temperature = data_since_last_collection['Temperature (C)']
        weather_temperature = weather_data_since_last_collection['Temperatur']
        weather_rain = weather_data_since_last_collection['Niederschlagsmenge']
        weather_moisture = weather_data_since_last_collection['Relative Feuchte']

        # Create collection data entry
        collection_data = {}
        collection_data['timestamp'] = row['created_at']
        collection_data['container_id'] = row['deveui']
        collection_data['last_collection'] = time_difference.days
        collection_data['pre_height'] = data_last_measurement['Height (cm)']
        collection_data['post_height'] = row['Height (cm)']
        collection_data['sensor_mean_temperature'] = temperature.mean()
        collection_data['sensor_max_temperature'] = temperature.max()
        collection_data['sensor_min_temperature'] = temperature.min()
        collection_data['weather_mean_temperature'] = weather_temperature.mean()
        collection_data['weather_max_temperature'] = weather_temperature.max()
        collection_data['weather_min_temperature'] = weather_temperature.min()
        collection_data['weather_mean_rain'] = weather_rain.mean()
        collection_data['weather_max_rain'] = weather_rain.max()
        collection_data['weather_min_rain'] = weather_rain.min()
        collection_data['weather_mean_moisture'] = weather_moisture.mean()
        collection_data['weather_max_moisture'] = weather_moisture.max()
        collection_data['weather_min_moisture'] = weather_moisture.min()
        collection_data['holiday_percentage'] = holiday_count
        collection_data['year'] = collection_time.year
        collection_data['month'] = collection_time.month
        collection_data['weekday'] = collection_time.weekday()
        collections_data.append(collection_data)
    return collections_data
```
%% Cell type:code id: tags:
``` python
# Each raw data file gets processed; the collections detected in all files
# are gathered into one DataFrame.
all_collections_data = []
for file in data_name:
    # load_data() appends '.txt' itself, so any other directory entry
    # (e.g. a placeholder file) cannot be loaded and is skipped.
    if not file.endswith('.txt'):
        continue
    container_id = file.replace('.txt', '')
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    # remove_outliers() works in place, so decoded_data is cleaned as well.
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(decoded_data)
    collections_data = get_collection_data(decoded_data, collections)
    all_collections_data.extend(collections_data)
    # Uncomment to also store the per-container preprocessed data:
    #write_preprocessed_data(container_id, preprocessed)
# Build the frame once at the end: DataFrame.append() no longer exists in
# pandas 2.x and copied the whole frame on every call.
collection_df = pd.DataFrame(all_collections_data)
```
%% Cell type:code id: tags:
``` python
# Store the collection data of all containers in a single file in data/preprocessed
collection_file = f"{write_to_path}/collection_data.txt"
collection_df.to_csv(path_or_buf=collection_file)
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
import os
# Location of the collection data written by the preprocessing notebook.
# The former plain string literal contained '\d', '\p' and '\c', which are
# invalid escape sequences; os.path.join yields the same path on Windows
# and a valid one elsewhere.
path_collection_data = os.path.join('..', 'data', 'preprocessed', 'collection_data.txt')
df = pd.read_csv(path_collection_data)
```
%% Cell type:code id: tags:
``` python
# Preview the first rows of the loaded collection data
df.head()
```
%%%% Output: execute_result
Unnamed: 0 timestamp container_id last_collection \
0 0 2020-05-22 18:51:01.742945 70B3D500700016DA -14
1 1 2020-06-05 14:49:42.681218 70B3D500700016DA -14
2 2 2020-06-29 13:47:52.050553 70B3D500700016DA -24
3 3 2020-07-17 13:46:18.287249 70B3D500700016DA -18
4 4 2020-08-07 09:44:36.149679 70B3D500700016DA -21
pre_height post_height sensor_mean_temperature sensor_max_temperature \
0 4 124 15.251029 47
1 20 126 16.410714 44
2 4 126 18.255446 43
3 12 128 19.053476 45
4 22 126 21.981524 47
sensor_min_temperature weather_mean_temperature ... weather_mean_rain \
0 0 14.283636 ... 0.037879
1 4 16.873193 ... 0.065964
2 4 18.670261 ... 0.043130
3 7 19.258796 ... 0.029167
4 6 21.973000 ... 0.020600
weather_max_rain weather_min_rain weather_mean_moisture \
0 1.8 0.0 58.121212
1 7.8 0.0 53.888554
2 10.2 0.0 65.890435
3 1.3 0.0 58.773148
4 4.2 0.0 49.794000
weather_max_moisture weather_min_moisture holiday_percentage year \
0 95.0 25.0 0.360606 2020
1 93.0 19.0 0.361446 2020
2 97.0 25.0 0.375652 2020
3 96.0 22.0 0.222222 2020
4 95.0 20.0 0.288000 2020
month weekday
0 5 4
1 6 4
2 6 0
3 7 4
4 8 4
[5 rows x 22 columns]
%% Cell type:code id: tags:
``` python
# Show column dtypes and non-null counts of the collection data
df.info()
```
%%%% Output: stream
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4118 entries, 0 to 4117
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 4118 non-null int64
1 timestamp 4118 non-null object
2 container_id 4118 non-null object
3 last_collection 4118 non-null object
4 pre_height 4118 non-null int64
5 post_height 4118 non-null int64
6 sensor_mean_temperature 4118 non-null float64
7 sensor_max_temperature 4118 non-null int64
8 sensor_min_temperature 4118 non-null int64
9 weather_mean_temperature 4118 non-null float64
10 weather_max_temperature 4118 non-null float64
11 weather_min_temperature 4118 non-null float64
12 weather_mean_rain 4118 non-null float64
13 weather_max_rain 4118 non-null float64
14 weather_min_rain 4118 non-null float64
15 weather_mean_moisture 4118 non-null float64
16 weather_max_moisture 4118 non-null float64
17 weather_min_moisture 4118 non-null float64
18 holiday_percentage 4118 non-null float64
19 year 4118 non-null int64
20 month 4118 non-null int64
21 weekday 4118 non-null int64
dtypes: float64(11), int64(8), object(3)
memory usage: 707.9+ KB
%% Cell type:code id: tags:
``` python
# (a pasted '>>>' interpreter prompt was removed from the first import; it
# is a syntax error anywhere outside IPython)
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

# Feature columns used to predict the height measured right before a collection
X = df[['last_collection',
        'sensor_mean_temperature',
        'sensor_max_temperature',
        'sensor_min_temperature',
        'weather_mean_temperature',
        'weather_max_temperature',
        'weather_min_temperature',
        'weather_mean_rain',
        'weather_max_rain',
        'weather_min_rain',
        'weather_mean_moisture',
        'weather_max_moisture',
        'weather_min_moisture',
        'holiday_percentage',
        'year',
        'month',
        'weekday']]
# Target: last height reading before the collection
y = df['pre_height']

# Hold out 33% of the rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Standardize the features, then fit a support vector regressor with default settings
regr = make_pipeline(StandardScaler(), SVR())
regr.fit(X_train, y_train)
pred = regr.predict(X_test)
```
%% Cell type:code id: tags:
``` python
# Number of training samples (length of one feature column)
X_train['last_collection'].shape
```
%%%% Output: execute_result
(2759,)
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt

# Compare the held-out targets (black dots) with the model predictions
# (blue line), both drawn against the 'last_collection' feature.
# NOTE(review): the x values are not sorted, so the line connects the
# points in test-set order -- confirm this is the intended visualisation.
x_values = X_test['last_collection']
plt.scatter(x_values, y_test, color='black')
plt.plot(x_values, pred, color='blue', linewidth=3)

# Hide the tick marks on both axes
plt.xticks(())
plt.yticks(())

plt.show()
```
%%%% Output: display_data
![]()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment