Commit 90d06782 authored by tills's avatar tills
Browse files

Clean up

parent 6df313ae
This diff is collapsed.
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import os
```
%% Cell type:code id: tags:
``` python
# File Path of raw data
file_data=r'..\data\raw'
# All file names in the raw-data folder (expected to be '<container>.txt' — the
# main loop below strips the '.txt' suffix to get the container id)
data_name=os.listdir(file_data)
# Output directory for the preprocessed CSV files
write_to_path = r'..\data\preprocessed'
```
%% Cell type:code id: tags:
``` python
# Loads the raw json file for a specific container
def load_data(container):
    """Read the container's raw JSON-lines export and unpack its payload."""
    source = file_data + '\\' + container + '.txt'
    parsed = pd.read_json(source, lines=True)
    # The actual measurement records sit at column 1, row 0 of the parsed frame
    return pd.DataFrame(parsed[1][0])
```
%% Cell type:code id: tags:
``` python
# Reverses the dataframe passed into the function
# The loaded raw data is in wrong historical order (Youngest date is the first entry)
def reverse_data(raw_data):
    """Flip *raw_data* so the oldest entry comes first, with a fresh 0..n-1 index."""
    chronological = raw_data.loc[::-1]
    return chronological.reset_index(drop=True)
```
%% Cell type:code id: tags:
``` python
# Decodes the json data in the 'decoded_data' column of the dataframe
# Also strips the unit letters from the columns listed below and converts them to integers
def dec_data(data):
    """Flatten the nested sensor payload of *data* into typed columns.

    Adds a parsed ``time`` column plus integer sensor columns (heights,
    voltage, temperature, tilt). Mutates and returns *data*.
    """
    df_decoded_data = pd.json_normalize(data.decoded_data)
    # BUG FIX: previously read the *global* `raw_data["created_at"]` (the
    # un-reversed frame) instead of the `data` argument, so `time` was
    # misaligned with the reversed rows. Keep only the "YYYY-MM-DDTHH:MM:SS"
    # prefix of the timestamp string.
    data["time"] = pd.to_datetime([str(st)[0:19] for st in data["created_at"]])
    # (target column, flattened source column, unit suffix to strip)
    unit_columns = [
        ("Height (cm)", "sensor_data.Height 1", "cm"),
        ("Height 1 (cm)", "sensor_data.Height 1", "cm"),
        ("Height 2 (cm)", "sensor_data.Height 2", "cm"),
        ("Height 3 (cm)", "sensor_data.Height 3", "cm"),
        ("Height 4 (cm)", "sensor_data.Height 4", "cm"),
        ("Voltage (mV)", "sensor_data.Voltage", "mV"),
        ("Temperature (C)", "sensor_data.Temperature", "C"),
        ("Tilt (Degree)", "sensor_data.Tilt", "Degree"),
    ]
    for target, source, unit in unit_columns:
        data[target] = df_decoded_data[source].str.replace(unit, "").astype(int)
    return data
```
%% Cell type:code id: tags:
``` python
# Removes outliers from the data
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data):
    """Replace implausible temperature spikes (>= 150) with a neighbouring value, in place."""
    spike_positions = data[data['Temperature (C)'] >= 150].index
    for pos in spike_positions:
        # Prefer the previous reading; the very first row falls back to the next one.
        neighbour = pos - 1 if pos != 0 else pos + 1
        data.at[pos, 'Temperature (C)'] = data.iloc[neighbour]['Temperature (C)']
    return data
```
%% Cell type:code id: tags:
``` python
# Writes the dataframe into a .csv file
# Container specifies the name of the file
def write_preprocessed_data(container, data):
    """Dump *data* as CSV into the preprocessed folder, named '<container>.txt'."""
    destination = write_to_path + '/' + container + '.txt'
    data.to_csv(path_or_buf=destination)
```
%% Cell type:code id: tags:
``` python
# Detects trash collections when sudden changes in Height occur
# TODO: improve algorithm
# TODO: test on multiple container files
def detect_collection(data):
    """Return positional row indices where a trash collection likely occurred.

    A collection is flagged when the height reading jumps up by more than 25%
    of the previous reading and ends above 110 cm (presumably the sensor
    measures free height, so emptying raises the value — TODO confirm).
    """
    last_value = None
    collections = []
    limit = 0.25  # relative jump threshold
    for idx, row in data.iterrows():
        # FIX: identity comparison with None instead of `!= None`; also removed
        # the unused `created_at = row['unix_time']` assignment
        if last_value is not None:
            height = row['Height (cm)']
            if height - last_value > last_value * limit and height > 110:
                collections.append(idx)
        last_value = row['Height (cm)']
    return collections
```
%% Cell type:code id: tags:
``` python
# For each trash collection detected it summarizes the data since the last collection
# Data: timestamp of collection, container ID, time since last collection, Height before and after collection, mean, max and min Temperature
# TODO: More data?
def get_collection_data(data, collections):
    """Build one summary dict per detected collection.

    data        -- preprocessed measurements (positional index 0..n-1)
    collections -- positional row indices returned by detect_collection()
    Returns a list of dicts, one per collection.
    """
    collections_data = []
    for collection in collections:
        # O(n) lookup of this collection's position; safe because
        # detect_collection yields strictly increasing, unique indices
        idx = collections.index(collection)
        row = data.iloc[collection]
        collection_data = {}
        # The first interval starts at the very first measurement
        last_collection = 0
        if idx != 0:
            last_collection = collections[idx-1]
        # Get data since last collection and data of last collection
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection-1]
        # Calculate time difference; `unix_time` appears to be in milliseconds
        # (divided by 1000 before fromtimestamp) — TODO confirm
        collection_time = datetime.datetime.fromtimestamp(int(row['unix_time'])/1000)
        last_collection_time = datetime.datetime.fromtimestamp(int(data_last_collection['unix_time'])/1000)
        # NOTE(review): older minus newer yields a NEGATIVE timedelta; downstream
        # consumers appear to negate it again — confirm before changing the sign
        time_difference = last_collection_time-collection_time
        # Calculate temperature statistics since the last collection
        temperature_mean = data_since_last_collection['Temperature (C)'].mean()
        temperature_max = data_since_last_collection['Temperature (C)'].max()
        temperature_min = data_since_last_collection['Temperature (C)'].min()
        # Create collection data entry
        collection_data['timestamp'] = row['created_at']
        collection_data['container_id'] = row['deveui']
        collection_data['last_collection'] = time_difference
        # Height of the measurement just before the detected jump
        collection_data['pre_height'] = data_last_measurement['Height (cm)']
        # Height at the collection row itself
        collection_data['post_height'] = row['Height (cm)']
        collection_data['mean_temperature'] = temperature_mean
        collection_data['max_temperature'] = temperature_max
        collection_data['min_temperature'] = temperature_min
        collections_data.append(collection_data)
    return collections_data
```
%% Cell type:code id: tags:
``` python
# Each raw data file gets processed and then written into a new file in data/preprocessed
collection_df = pd.DataFrame()
for file in data_name:
    container_id = file.replace('.txt', '')
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(decoded_data)
    collections_data = get_collection_data(decoded_data, collections)
    # FIX: DataFrame.append was removed in pandas 2.0 — build the summary
    # frame with pd.concat instead (same ignore_index/sort semantics)
    collection_df = pd.concat(
        [collection_df, pd.DataFrame(collections_data)],
        ignore_index=True, sort=False,
    )
    write_preprocessed_data(container_id, preprocessed)
```
%% Cell type:code id: tags:
``` python
# Write all available collection data into one file in data/preprocessed
# NOTE(review): the standalone script variant later reads 'collection_data.txt'
# from ..\data (not ..\data\preprocessed) — confirm which location is intended
collection_df.to_csv(path_or_buf=write_to_path+'/collection_data.txt')
```
%% Cell type:code id: tags:
``` python
```
Placeholder file — delete me; it exists only to keep this folder tracked in git.
\ No newline at end of file
import pandas as pd
import numpy as np
import os
# Locations of the collection summary (input) and the training data (output)
path_data = r'..\data'
path_collection_data = path_data + '\collection_data.txt'
path_train_data = r'..\data\modeling\train'

# Containers qualify for the training set only while their mean collection
# interval (days) and their total number of collections stay at or below these
T_collection_intervall = 75
T_number_collections = 40

# BUG FIX: `file_number` was undefined here (NameError); the script reads the
# collection summary produced by the preprocessing step
df = pd.read_csv(path_collection_data)

# Calculate Collection Intervall per container.
# `last_collection` was serialized as a negative timedelta string such as
# "-3 days +02:10:00"; take everything before the "d" of "days" and negate
# it to obtain a positive interval in whole days.
df["collection_intervall"] = [str(st)[0:st.index("d")] for st in df["last_collection"]]
df["collection_intervall"] = -df["collection_intervall"].astype(int)

# One marker per row, so a groupby-sum gives the number of collections per container
df["number_collections"] = 1
number_collections = pd.DataFrame({
    'container_id': df["container_id"],
    'number_collections': df['number_collections'],
}).groupby(['container_id']).sum()

# Per-container means of the features relevant for modeling
relevant_data = pd.DataFrame({
    'container_id': df["container_id"],
    'collection_intervall': df["collection_intervall"],
    'pre_height': df["pre_height"],
    'post_height': df["post_height"],
    'sensor_mean_temperature': df["sensor_mean_temperature"],
    'lockdown': df['Lockdown'],
})
data_all = relevant_data.groupby(['container_id']).mean()
data_all['number_collections'] = number_collections['number_collections']

# Keep only containers within both thresholds
data_all = data_all[data_all['collection_intervall'] <= T_collection_intervall]
data_all = data_all[data_all['number_collections'] <= T_number_collections]
data_all = data_all.reset_index()

# Collect the raw rows of every selected container, preserving container order.
# FIX: the original looped with DataFrame.append (removed in pandas 2.0).
container = list(data_all['container_id'])
train_data = pd.concat([df[df['container_id'] == item] for item in container])
train_data.to_csv(path_or_buf=path_train_data + '/train_data.txt')
\ No newline at end of file
import pandas as pd
from pandas.io.json import json_normalize
import datetime
import os
# Base data directories (Windows-style relative paths, resolved from the
# script's working directory)
path_data = r'..\data'
path_raw = r'..\data\raw'
path_preprocessed = r'..\data\preprocessed'
path_additional = r'..\data\additional'
# Loads the raw json file for a specific container
def load_data(container):
    """Read '<path_raw>\\<container>.txt' (JSON lines) and unpack the payload."""
    source = path_raw + '\\' + container + '.txt'
    raw_frame = pd.read_json(source, lines=True)
    # The measurement records of interest sit at column 1, row 0
    return pd.DataFrame(raw_frame[1][0])
# Reverses the dataframe passed into the function
# The loaded raw data is in wrong historical order (Youngest date is the first entry)
def reverse_data(raw_data):
    """Return *raw_data* with row order inverted and the index rebuilt from 0."""
    flipped = raw_data.loc[::-1]
    flipped = flipped.reset_index(drop=True)
    return flipped
# Decodes the json data in the 'decoded_data' column of the dataframe
# Also strips the unit letters from the columns listed below and converts them to integers
def dec_data(data):
    """Flatten the nested sensor payload of *data* into typed columns.

    Adds a parsed ``time`` column plus integer sensor columns (heights,
    voltage, temperature, tilt). Mutates and returns *data*.
    """
    df_decoded_data = pd.json_normalize(data.decoded_data)
    # BUG FIX: previously read the *global* `raw_data["created_at"]` (the
    # un-reversed frame) instead of the `data` argument, so `time` was
    # misaligned with the reversed rows. Keep only the "YYYY-MM-DDTHH:MM:SS"
    # prefix of the timestamp string.
    data["time"] = pd.to_datetime([str(st)[0:19] for st in data["created_at"]])
    # (target column, flattened source column, unit suffix to strip)
    unit_columns = [
        ("Height (cm)", "sensor_data.Height 1", "cm"),
        ("Height 1 (cm)", "sensor_data.Height 1", "cm"),
        ("Height 2 (cm)", "sensor_data.Height 2", "cm"),
        ("Height 3 (cm)", "sensor_data.Height 3", "cm"),
        ("Height 4 (cm)", "sensor_data.Height 4", "cm"),
        ("Voltage (mV)", "sensor_data.Voltage", "mV"),
        ("Temperature (C)", "sensor_data.Temperature", "C"),
        ("Tilt (Degree)", "sensor_data.Tilt", "Degree"),
    ]
    for target, source, unit in unit_columns:
        data[target] = df_decoded_data[source].str.replace(unit, "").astype(int)
    return data
# Removes outliers from the data
def remove_outliers(data):
    """Replace implausible readings (>= 150) in temperature and height with a neighbouring value, in place."""
    # Temperature first, then height — matching the original pass order.
    for column in ('Temperature (C)', 'Height (cm)'):
        for pos in data[data[column] >= 150].index:
            # Prefer the previous reading; the very first row uses the next one.
            neighbour = pos - 1 if pos != 0 else pos + 1
            data.at[pos, column] = data.iloc[neighbour][column]
    return data
# Writes the dataframe into a .csv file
# Container specifies the name of the file
def write_preprocessed_data(container, data):
    """Persist *data* as CSV under path_preprocessed, named '<container>.txt'."""
    target = path_preprocessed + '/' + container + '.txt'
    data.to_csv(path_or_buf=target)
# Detects trash collections when sudden changes in Heigth occur
def detect_collection(data):
    """Return positional row indices where the height reading jumps sharply upwards.

    A jump counts as a collection when it exceeds 25% of the previous reading
    and the new height is above 110 cm.
    """
    collections = []
    limit = 0.25  # relative jump threshold
    previous = None
    for idx, row in data.iterrows():
        height = row['Height (cm)']
        if previous is not None and height - previous > previous * limit and height > 110:
            collections.append(idx)
        previous = height
    return collections
# For each trash collection detected it summarizes the data since the last collection
# Data: timestamp of collection, container ID, time since last collection, Height before and after collection, mean, max and min Temperature
# TODO: More data?
def get_collection_data(data, collections):
    """Build one summary dict per detected collection, enriched with weather data.

    data        -- preprocessed measurements (positional index 0..n-1)
    collections -- positional row indices returned by detect_collection()
    Returns a list of dicts, one per collection.
    """
    collections_data = []
    # Re-read on every call (could be hoisted); provides 'MESS_DATUM'
    # (German: measurement date), temperature/rain/humidity columns and a
    # 'Frei[1/0]' holiday flag
    weather_data = pd.read_excel(path_additional+r'\additional_data.xlsx')
    for collection in collections:
        # O(n) lookup of this collection's position; safe because
        # detect_collection yields strictly increasing, unique indices
        idx = collections.index(collection)
        row = data.iloc[collection]
        collection_data = {}
        # The first interval starts at the very first measurement
        last_collection = 0
        if idx != 0:
            last_collection = collections[idx-1]
        # Get data since last collection and data of last collection
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection-1]
        # Calculate time difference; `unix_time` appears to be in milliseconds
        # (divided by 1000 before fromtimestamp) — TODO confirm
        collection_time = datetime.datetime.fromtimestamp(int(row['unix_time'])/1000)
        last_collection_time = datetime.datetime.fromtimestamp(int(data_last_collection['unix_time'])/1000)
        # NOTE(review): older minus newer yields a NEGATIVE timedelta; downstream
        # consumers appear to negate it again — confirm before changing the sign
        time_difference = last_collection_time-collection_time
        # Weather Data restricted to the window between the two collections
        weather_data_since_last_collection = weather_data[(weather_data['MESS_DATUM'] <= collection_time) & (weather_data['MESS_DATUM'] >= last_collection_time)]
        # NOTE(review): raises ZeroDivisionError when no weather rows fall inside
        # the window — consider guarding
        holiday_count = len(weather_data_since_last_collection[weather_data_since_last_collection['Frei[1/0]']==1].index)/len(weather_data_since_last_collection.index)
        # Create collection data entry
        collection_data['timestamp'] = row['created_at']
        collection_data['container_id'] = row['deveui']
        collection_data['last_collection'] = time_difference
        # Height of the measurement just before the detected jump
        collection_data['pre_height'] = data_last_measurement['Height (cm)']
        # Height at the collection row itself
        collection_data['post_height'] = row['Height (cm)']
        collection_data['sensor_mean_temperature'] = data_since_last_collection['Temperature (C)'].mean()
        collection_data['sensor_max_temperature'] = data_since_last_collection['Temperature (C)'].max()
        collection_data['sensor_min_temperature'] = data_since_last_collection['Temperature (C)'].min()
        collection_data['weather_mean_temperature'] = weather_data_since_last_collection['Temperatur'].mean()
        collection_data['weather_max_temperature'] = weather_data_since_last_collection['Temperatur'].max()
        collection_data['weather_min_temperature'] = weather_data_since_last_collection['Temperatur'].min()
        collection_data['weather_mean_rain'] = weather_data_since_last_collection['Niederschlagsmenge'].mean()
        collection_data['weather_max_rain'] = weather_data_since_last_collection['Niederschlagsmenge'].max()
        collection_data['weather_min_rain'] = weather_data_since_last_collection['Niederschlagsmenge'].min()
        collection_data['weather_mean_moisture'] = weather_data_since_last_collection['Relative Feuchte'].mean()
        collection_data['weather_max_moisture'] = weather_data_since_last_collection['Relative Feuchte'].max()
        collection_data['weather_min_moisture'] = weather_data_since_last_collection['Relative Feuchte'].min()
        # Fraction of days in the window flagged as free/holiday
        collection_data['holiday_percentage'] = holiday_count
        collection_data['year'] = collection_time.year
        collection_data['month'] = collection_time.month
        collection_data['weekday'] = collection_time.weekday()
        collections_data.append(collection_data)
    return collections_data
# Each raw data file gets processed and then written into a new file in data/preprocessed
containers = os.listdir(path_raw)
collection_df = pd.DataFrame()
for file in containers:
    container_id = file.replace('.txt', '')
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(decoded_data)
    collections_data = get_collection_data(decoded_data, collections)
    # FIX: DataFrame.append was removed in pandas 2.0 — build the summary
    # frame with pd.concat instead (same ignore_index/sort semantics)
    collection_df = pd.concat(
        [collection_df, pd.DataFrame(collections_data)],
        ignore_index=True, sort=False,
    )
    write_preprocessed_data(container_id, preprocessed)
# Write all available collection data into one file. NOTE: the original comment
# said "data/preprocessed", but the target below is the `data` directory itself
# — which is where the training script reads it from.
collection_df.to_csv(path_or_buf=path_data + '/collection_data.txt')
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment