Commit 90d06782 authored by tills's avatar tills
Browse files

Clean up

parent 6df313ae
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 4
}
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
delete me, just to keep the folder in git
\ No newline at end of file
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import os
```
%% Cell type:code id: tags:
``` python
# Input / output locations, relative to the notebook's working directory.
# The raw strings keep the backslashes literal.
file_data = r'..\data\raw'                # directory holding the raw exports
write_to_path = r'..\data\preprocessed'   # directory receiving the generated files
# Every entry found in the raw-data directory; the driver loop iterates over these.
data_name = os.listdir(path=file_data)
```
%% Cell type:code id: tags:
``` python
def load_data(container, data_dir=None):
    """Load the raw JSON export of one container into a DataFrame.

    Parameters
    ----------
    container : str
        Name of the export file without its ``.txt`` extension.
    data_dir : str, optional
        Directory that contains the export. When omitted, the module-level
        ``file_data`` path is used.

    Returns
    -------
    pandas.DataFrame
        Frame built from the value stored in column ``1`` of the first line
        of the file.
    """
    if data_dir is None:
        data_dir = file_data
    # os.path.join inserts the separator of the running platform instead of a
    # hard-coded backslash.
    file_number = os.path.join(data_dir, container + '.txt')
    df = pd.read_json(file_number, lines=True)
    # The records are read from the second element (column 1) of the first line.
    raw_data = pd.DataFrame(df[1][0])
    return raw_data
```
%% Cell type:code id: tags:
``` python
def reverse_data(raw_data):
    """Return the rows of *raw_data* in opposite order with a fresh index.

    The loaded raw data is in the wrong historical order (the youngest date is
    the first entry), so the frame is flipped and renumbered from zero.
    """
    flipped = raw_data.iloc[::-1]
    return flipped.reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
def dec_data(data):
    """Decode the nested ``decoded_data`` column into flat integer columns.

    Adds a ``time`` column parsed from the first 19 characters of
    ``created_at`` and one integer column per sensor field, with the unit
    suffix removed. The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``created_at`` column and a ``decoded_data`` column of
        nested records that ``pandas.json_normalize`` can flatten.

    Returns
    -------
    pandas.DataFrame
        The same frame object with the additional columns.
    """
    # (output column, flattened source field, unit suffix to strip)
    # NOTE(review): "Height (cm)" reads "sensor_data.Height 1", exactly like
    # "Height 1 (cm)" - kept from the original, confirm this is intended.
    sensor_columns = (
        ("Height (cm)", "sensor_data.Height 1", "cm"),
        ("Height 1 (cm)", "sensor_data.Height 1", "cm"),
        ("Height 2 (cm)", "sensor_data.Height 2", "cm"),
        ("Height 3 (cm)", "sensor_data.Height 3", "cm"),
        ("Height 4 (cm)", "sensor_data.Height 4", "cm"),
        ("Voltage (mV)", "sensor_data.Voltage", "mV"),
        ("Temperature (C)", "sensor_data.Temperature", "C"),
        ("Tilt (Degree)", "sensor_data.Tilt", "Degree"),
    )

    df_decoded_data = pd.json_normalize(data["decoded_data"].tolist())
    # Share the caller's index so the column assignments below pair up row by
    # row even when ``data`` does not carry a default 0..n-1 index.
    df_decoded_data.index = data.index

    # Use the timestamps of the frame that was passed in, not those of a
    # global frame, so every row keeps its own time.
    data["time"] = pd.to_datetime(data["created_at"].astype(str).str[:19])

    for column, field, unit in sensor_columns:
        # regex=False: the unit is a literal suffix, never a pattern.
        stripped = df_decoded_data[field].str.replace(unit, "", regex=False)
        data[column] = stripped.astype(int)
    return data
```
%% Cell type:code id: tags:
``` python
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data, threshold=150):
    """Replace implausible temperature readings with a neighbouring value.

    Every ``'Temperature (C)'`` value greater than or equal to *threshold* is
    replaced by the closest earlier reading below the threshold. Outliers at
    the very start, which have no earlier reading, take the closest later
    one. If no reading below the threshold exists, the values are left
    untouched. The frame is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``'Temperature (C)'`` column.
    threshold : number, optional
        Readings at or above this value count as outliers (default 150).

    Returns
    -------
    pandas.DataFrame
        The same frame object with the cleaned temperature column.
    """
    temperatures = data['Temperature (C)']
    outliers = temperatures >= threshold
    # Blank the outliers, then pull in the previous valid reading (ffill) and,
    # for a leading run, the next valid one (bfill). This works by position,
    # so it does not depend on the index labels.
    replacement = temperatures.mask(outliers).ffill().bfill()
    # Without any valid reading there is nothing to copy: keep the original.
    replacement = replacement.fillna(temperatures)
    cleaned = temperatures.where(~outliers, replacement)
    # mask() promotes integers to float; restore the column's dtype.
    data['Temperature (C)'] = cleaned.astype(temperatures.dtype)
    return data
```
%% Cell type:code id: tags:
``` python
def write_preprocessed_data(container, data):
    """Write *data* as CSV to ``<write_to_path>/<container>.txt``.

    *container* only determines the file name; the content is whatever
    ``DataFrame.to_csv`` produces with its default settings.
    """
    target = "".join([write_to_path, '/', container, '.txt'])
    data.to_csv(target)
```
%% Cell type:code id: tags:
``` python
# TODO: improve algorithm
# TODO: test on multiple container files
def detect_collection(data, limit=0.25, min_height=110):
    """Detect trash collections from sudden increases of ``'Height (cm)'``.

    A row counts as a collection when its height exceeds the previous row's
    height by more than ``limit`` times that previous height and the new
    height is above ``min_height``. The first row has no predecessor and is
    never reported.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``'Height (cm)'`` column, in chronological order.
    limit : float, optional
        Required relative increase over the previous reading (default 0.25).
    min_height : number, optional
        Height the new reading has to exceed (default 110).

    Returns
    -------
    list
        Index labels of the rows detected as collections.
    """
    heights = data['Height (cm)']
    previous = heights.shift(1)
    # Comparisons against the missing predecessor of the first row are False,
    # so that row is skipped without special handling.
    jumped = (heights - previous) > previous * limit
    is_collection = jumped & (heights > min_height)
    return data.index[is_collection].tolist()
```
%% Cell type:code id: tags:
``` python
# Load the additional (weather / holiday) table from the Excel sheet. The bare
# name on the last line makes the notebook render the frame as cell output.
weather_data = pd.read_excel(io=r'..\data\additional data\additional_data.xlsx')
weather_data
```
%%%% Output: execute_result
MESS_DATUM Temperatur Relative Feuchte Niederschlagsmenge \
0 2020-04-01 00:00:00 -0.6 57.0 0.0
1 2020-04-01 01:00:00 -1.0 58.0 0.0
2 2020-04-01 02:00:00 0.5 54.0 0.0
3 2020-04-01 03:00:00 -1.1 62.0 0.0
4 2020-04-01 04:00:00 -0.1 58.0 0.0
... ... ... ... ...
10219 2021-05-31 19:00:00 19.0 40.0 0.0
10220 2021-05-31 20:00:00 16.2 49.0 0.0
10221 2021-05-31 21:00:00 13.4 60.0 0.0
10222 2021-05-31 22:00:00 12.2 66.0 0.0
10223 2021-05-31 23:00:00 12.3 68.0 0.0
Feiertag Wochentag Frei[1/0]
0 0 Mittwoch 0
1 0 Mittwoch 0
2 0 Mittwoch 0
3 0 Mittwoch 0
4 0 Mittwoch 0
... ... ... ...
10219 0 Montag 0
10220 0 Montag 0
10221 0 Montag 0
10222 0 Montag 0
10223 0 Montag 0
[10224 rows x 7 columns]
%% Cell type:code id: tags:
``` python
def _add_stats(record, source, quantity, values):
    """Store mean, max and min of *values* as ``<source>_<stat>_<quantity>``."""
    record[f'{source}_mean_{quantity}'] = values.mean()
    record[f'{source}_max_{quantity}'] = values.max()
    record[f'{source}_min_{quantity}'] = values.min()


def get_collection_data(data, collections, weather_data=None):
    """Build one feature record per detected collection.

    For every entry of *collections* the interval since the previous
    collection (or since the first row for the first collection) is
    summarised: elapsed days, height before and after, sensor temperature
    statistics, weather statistics and the share of weather rows flagged in
    ``'Frei[1/0]'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Measurements with the columns ``unix_time``, ``created_at``,
        ``deveui``, ``'Height (cm)'`` and ``'Temperature (C)'``.
        ``unix_time`` is divided by 1000 before conversion, i.e. it is
        treated as milliseconds.
    collections : list of int
        Row positions of the detected collections, in ascending order.
    weather_data : pandas.DataFrame, optional
        Table with ``MESS_DATUM``, ``Temperatur``, ``Niederschlagsmenge``,
        ``'Relative Feuchte'`` and ``'Frei[1/0]'``. When omitted it is read
        from the additional-data Excel sheet.

    Returns
    -------
    list of dict
        One record per collection, in the order of *collections*.
    """
    if weather_data is None:
        weather_data = pd.read_excel(r'..\data\additional data\additional_data.xlsx')

    collections_data = []
    # The first interval starts at the first measurement.
    last_collection = 0
    for collection in collections:
        row = data.iloc[collection]
        # Get data since last collection and data of last collection
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection - 1]

        # Elapsed time: current minus previous, so the day count is positive.
        # NOTE(review): fromtimestamp yields naive local time - confirm that
        # MESS_DATUM is expressed in the same local time.
        collection_time = datetime.datetime.fromtimestamp(int(row['unix_time']) / 1000)
        last_collection_time = datetime.datetime.fromtimestamp(
            int(data_last_collection['unix_time']) / 1000)
        time_difference = collection_time - last_collection_time

        # Weather rows inside the interval, both bounds inclusive.
        in_interval = weather_data['MESS_DATUM'].between(last_collection_time, collection_time)
        weather_data_since_last_collection = weather_data[in_interval]
        if len(weather_data_since_last_collection.index):
            holiday_count = (weather_data_since_last_collection['Frei[1/0]'] == 1).mean()
        else:
            # No weather coverage for this interval: report "unknown" rather
            # than dividing by zero.
            holiday_count = float('nan')

        # Create collection data entry (key order defines the column order).
        collection_data = {}
        collection_data['timestamp'] = row['created_at']
        collection_data['container_id'] = row['deveui']
        collection_data['last_collection'] = time_difference.days
        collection_data['pre_height'] = data_last_measurement['Height (cm)']
        collection_data['post_height'] = row['Height (cm)']
        _add_stats(collection_data, 'sensor', 'temperature',
                   data_since_last_collection['Temperature (C)'])
        _add_stats(collection_data, 'weather', 'temperature',
                   weather_data_since_last_collection['Temperatur'])
        _add_stats(collection_data, 'weather', 'rain',
                   weather_data_since_last_collection['Niederschlagsmenge'])
        _add_stats(collection_data, 'weather', 'moisture',
                   weather_data_since_last_collection['Relative Feuchte'])
        collection_data['holiday_percentage'] = holiday_count
        collection_data['year'] = collection_time.year
        collection_data['month'] = collection_time.month
        collection_data['weekday'] = collection_time.weekday()
        collections_data.append(collection_data)

        last_collection = collection
    return collections_data
```
%% Cell type:code id: tags:
``` python
# Each raw data file is processed and its detected collections are gathered
# into one table (collection_df).
collections_rows = []
for file in data_name:
    # splitext removes only a trailing extension; entries that are not .txt
    # exports (e.g. placeholder files) are skipped instead of failing to load.
    container_id, extension = os.path.splitext(file)
    if extension != '.txt':
        continue
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(preprocessed)
    collections_data = get_collection_data(preprocessed, collections)
    collections_rows.extend(collections_data)
    # Per-container output is currently disabled:
    #write_preprocessed_data(container_id, preprocessed)
# DataFrame.append no longer exists in pandas 2.x; build the frame once from
# all collected records instead of growing it per file.
collection_df = pd.DataFrame(collections_rows)
```
%% Cell type:code id: tags:
``` python
# Persist the collection records of all containers as a single CSV file in
# the preprocessed-data directory.
collection_df.to_csv("".join((write_to_path, '/collection_data.txt')))
```
%% Cell type:code id: tags:
``` python
```
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment