Commit b6f3bbea authored by uouzl's avatar uouzl
Browse files

Delete Collection Detection.ipynb

parent f3dbf29e
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import os
```
%% Cell type:code id: tags:
``` python
# Folder holding the raw per-container files and folder receiving the output.
# Built with os.path.join so the separator matches the host operating system
# (on Windows this yields exactly the former '..\data\raw' style strings).
file_data = os.path.join('..', 'data', 'raw')
# os.listdir() returns entries in arbitrary order; sorting keeps the
# processing order, and therefore the row order of the output, reproducible.
data_name = sorted(os.listdir(file_data))
write_to_path = os.path.join('..', 'data', 'preprocessed')
```
%% Cell type:code id: tags:
``` python
# Loads the raw json file for a specific container
def load_data(container):
    """Load the raw JSON-lines file of one container into a dataframe.

    Args:
        container: File name without the ``.txt`` suffix; the file is looked
            up inside the module-level ``file_data`` folder.

    Returns:
        A dataframe built from the value stored in column ``1`` of the first
        line of the file.
    """
    # os.path.join picks the right separator instead of a hard-coded '\\'.
    raw_path = os.path.join(file_data, container + '.txt')
    parsed_lines = pd.read_json(raw_path, lines=True)
    # Only column 1 of the first parsed line is used as the record source.
    return pd.DataFrame(parsed_lines[1][0])
```
%% Cell type:code id: tags:
``` python
# Reverses the dataframe passed into the function
# The loaded raw data is in wrong historical order (Youngest date is the first entry)
def reverse_data(raw_data):
    """Return a copy of ``raw_data`` with its rows in reverse order.

    The result gets a fresh 0..n-1 index; the input frame is not modified.
    """
    flipped = raw_data.iloc[::-1]
    return flipped.reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
# Decodes the json data in the 'decoded_data' column of the dataframe
# Also removes letters from the columns listed below and transforms them into integers
def dec_data(data):
    """Flatten the ``decoded_data`` column and add numeric sensor columns.

    The frame is modified in place and also returned.

    Args:
        data: Dataframe with a ``decoded_data`` column of nested dicts (with
            the ``sensor_data.*`` entries used below) and a ``created_at``
            column.

    Returns:
        ``data`` with the added columns ``time``, ``Height (cm)``,
        ``Height 1 (cm)`` .. ``Height 4 (cm)``, ``Voltage (mV)``,
        ``Temperature (C)`` and ``Tilt (Degree)``.
    """
    sensor = pd.json_normalize(data["decoded_data"])

    # Timestamps must come from the frame being processed. Reading them from
    # the global ``raw_data`` (the un-reversed frame) stored them in the
    # opposite order to the rows of ``data``.
    # Only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") are parsed.
    data["time"] = pd.to_datetime([str(stamp)[:19] for stamp in data["created_at"]])

    # (target column, flattened source column, unit suffix to strip)
    conversions = (
        ("Height (cm)", "sensor_data.Height 1", "cm"),
        ("Height 1 (cm)", "sensor_data.Height 1", "cm"),
        ("Height 2 (cm)", "sensor_data.Height 2", "cm"),
        ("Height 3 (cm)", "sensor_data.Height 3", "cm"),
        ("Height 4 (cm)", "sensor_data.Height 4", "cm"),
        ("Voltage (mV)", "sensor_data.Voltage", "mV"),
        ("Temperature (C)", "sensor_data.Temperature", "C"),
        ("Tilt (Degree)", "sensor_data.Tilt", "Degree"),
    )
    for target, source, unit in conversions:
        numbers = sensor[source].str.replace(unit, "", regex=False).astype(int)
        # Assign by position so the result does not depend on index alignment
        # between the flattened frame and ``data``.
        data[target] = numbers.to_numpy()
    return data
```
%% Cell type:code id: tags:
``` python
# Removes outliers from the data
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data, threshold=150):
    """Replace implausible temperature readings with a neighbouring value.

    Readings in ``'Temperature (C)'`` that are ``>= threshold`` are replaced
    by the closest earlier valid reading; outliers at the very start take the
    first later valid reading instead. The frame is modified in place and
    also returned.

    Args:
        data: Dataframe with a ``'Temperature (C)'`` column.
        threshold: Readings at or above this value count as outliers.

    Returns:
        ``data``. It is left untouched when there are no outliers, or when
        every reading is an outlier and there is nothing to fill from.
    """
    column = 'Temperature (C)'
    temperatures = data[column]
    is_outlier = temperatures >= threshold
    if not is_outlier.any() or is_outlier.all():
        return data

    # Forward fill covers runs of outliers; the backward fill only matters
    # for outliers that precede the first valid reading.
    replacement = temperatures.mask(is_outlier).ffill().bfill()
    # Only the outlier positions are overwritten, and the original dtype is
    # restored because masking goes through floats.
    data[column] = temperatures.where(~is_outlier, replacement).astype(temperatures.dtype)
    return data
```
%% Cell type:code id: tags:
``` python
# Writes the dataframe into a .csv file
# Container specifies the name of the file
def write_preprocessed_data(container, data):
    """Save ``data`` with ``DataFrame.to_csv`` as ``<write_to_path>/<container>.txt``."""
    target_file = write_to_path + '/' + container + '.txt'
    data.to_csv(path_or_buf=target_file)
```
%% Cell type:code id: tags:
``` python
# Detects trash collections when sudden changes in height occur
# TODO: improve algorithm
# TODO: test on multiple container files
def detect_collection(data, limit=0.25, min_height=110):
    """Return the index labels of rows that look like a collection event.

    A row is flagged when its ``'Height (cm)'`` reading exceeds the previous
    row's reading by more than ``limit`` times that previous reading and is
    itself greater than ``min_height``. The first row has no predecessor and
    is never flagged.

    Args:
        data: Dataframe with a ``'Height (cm)'`` column, in chronological order.
        limit: Required relative increase over the previous reading.
        min_height: Reading that must be exceeded for a row to be flagged.

    Returns:
        List of index labels of the flagged rows, in row order.
    """
    collections = []
    previous = None
    # Only the height column is needed, so rows are not required to carry
    # any other column (the old unused 'unix_time' lookup is gone).
    for idx, height in data['Height (cm)'].items():
        if previous is not None:
            jumped = height - previous > previous * limit
            if jumped and height > min_height:
                collections.append(idx)
        previous = height
    return collections
```
%% Cell type:code id: tags:
``` python
# Load the additional weather/holiday table (hourly rows, see the cell output)
# into a module-level dataframe for inspection.
weather_data = pd.read_excel(r'..\data\additional data\additional_data.xlsx')
# Bare expression: the notebook renders the dataframe as this cell's output.
weather_data
```
%%%% Output: execute_result
MESS_DATUM Temperatur Relative Feuchte Niederschlagsmenge \
0 2020-04-01 00:00:00 -0.6 57.0 0.0
1 2020-04-01 01:00:00 -1.0 58.0 0.0
2 2020-04-01 02:00:00 0.5 54.0 0.0
3 2020-04-01 03:00:00 -1.1 62.0 0.0
4 2020-04-01 04:00:00 -0.1 58.0 0.0
... ... ... ... ...
10219 2021-05-31 19:00:00 19.0 40.0 0.0
10220 2021-05-31 20:00:00 16.2 49.0 0.0
10221 2021-05-31 21:00:00 13.4 60.0 0.0
10222 2021-05-31 22:00:00 12.2 66.0 0.0
10223 2021-05-31 23:00:00 12.3 68.0 0.0
Feiertag Wochentag Frei[1/0]
0 0 Mittwoch 0
1 0 Mittwoch 0
2 0 Mittwoch 0
3 0 Mittwoch 0
4 0 Mittwoch 0
... ... ... ...
10219 0 Montag 0
10220 0 Montag 0
10221 0 Montag 0
10222 0 Montag 0
10223 0 Montag 0
[10224 rows x 7 columns]
%% Cell type:code id: tags:
``` python
def get_collection_data(data, collections, weather_data=None):
    """Build one feature record per detected collection.

    For each collection the interval runs from the previous collection (or
    the first measurement, for the first collection) up to the collection
    itself. Sensor and weather statistics are aggregated over that interval.

    Args:
        data: Preprocessed measurements with ``unix_time`` (treated as
            milliseconds), ``created_at``, ``deveui``, ``Height (cm)`` and
            ``Temperature (C)``. ``collections`` entries are used as
            positions, so a default 0..n-1 index is expected.
        collections: Ascending row positions of the detected collections.
        weather_data: Optional weather table with ``MESS_DATUM``,
            ``Temperatur``, ``Niederschlagsmenge``, ``Relative Feuchte`` and
            ``Frei[1/0]``. When omitted, the Excel file under
            ``../data/additional data`` is read, as before.

    Returns:
        List of dicts, one per collection, in the order of ``collections``.
    """
    if weather_data is None:
        weather_data = pd.read_excel(
            os.path.join('..', 'data', 'additional data', 'additional_data.xlsx')
        )

    collections_data = []
    # Tracking the previous collection while iterating replaces the former
    # collections.index() lookup, which rescanned the list for every entry.
    last_collection = 0
    for collection in collections:
        row = data.iloc[collection]
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection - 1]

        # fromtimestamp() without tz yields naive local time, which is what
        # gets compared against MESS_DATUM below.
        collection_time = datetime.datetime.fromtimestamp(int(row['unix_time']) / 1000)
        last_collection_time = datetime.datetime.fromtimestamp(
            int(data_last_collection['unix_time']) / 1000
        )
        # Newer minus older: the reversed subtraction produced negative,
        # floored day counts.
        time_difference = collection_time - last_collection_time

        in_interval = (
            (weather_data['MESS_DATUM'] <= collection_time)
            & (weather_data['MESS_DATUM'] >= last_collection_time)
        )
        weather_since_last_collection = weather_data[in_interval]
        weather_rows = len(weather_since_last_collection.index)
        free_rows = int((weather_since_last_collection['Frei[1/0]'] == 1).sum())
        # No weather rows in the interval: report NaN (like the other weather
        # statistics) instead of raising ZeroDivisionError.
        holiday_share = free_rows / weather_rows if weather_rows else float('nan')

        collection_data = {
            'timestamp': row['created_at'],
            'container_id': row['deveui'],
            'last_collection': time_difference.days,
            'pre_height': data_last_measurement['Height (cm)'],
            'post_height': row['Height (cm)'],
        }
        # Key order matters: it becomes the column order of the output file.
        aggregated = (
            ('sensor', 'temperature', data_since_last_collection['Temperature (C)']),
            ('weather', 'temperature', weather_since_last_collection['Temperatur']),
            ('weather', 'rain', weather_since_last_collection['Niederschlagsmenge']),
            ('weather', 'moisture', weather_since_last_collection['Relative Feuchte']),
        )
        for source, quantity, series in aggregated:
            for stat in ('mean', 'max', 'min'):
                key = source + '_' + stat + '_' + quantity
                collection_data[key] = getattr(series, stat)()
        collection_data['holiday_percentage'] = holiday_share
        collection_data['year'] = collection_time.year
        collection_data['month'] = collection_time.month
        collection_data['weekday'] = collection_time.weekday()

        collections_data.append(collection_data)
        last_collection = collection
    return collections_data
```
%% Cell type:code id: tags:
``` python
# Each raw data file is loaded, put into chronological order, decoded and
# cleaned; the collections detected in it are gathered into one table.
# Writing the per-container preprocessed file is currently disabled (see the
# commented-out call at the end of the loop body).
collection_rows = []
for file in data_name:
    if not file.endswith('.txt'):
        # load_data() always appends '.txt', so other directory entries
        # cannot be loaded.
        continue
    container_id = os.path.splitext(file)[0]
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(decoded_data)
    collections_data = get_collection_data(decoded_data, collections)
    collection_rows.extend(collections_data)
    #write_preprocessed_data(container_id, preprocessed)

# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# re-copied all previous rows on every call.
collection_df = pd.DataFrame(collection_rows)
```
%% Cell type:code id: tags:
``` python
# Write all available collection data into one file in data/preprocessed.
# No separator is passed, so the file uses to_csv's default CSV format even
# though it is named .txt.
collection_df.to_csv(path_or_buf=write_to_path+'/collection_data.txt')
```
%% Cell type:code id: tags:
``` python
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment