Commit f3dbf29e authored by uouzl's avatar uouzl
Browse files

Upload New File

parent b8cfa6d2
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import datetime
import os
```
%% Cell type:code id: tags:
``` python
# File Path of raw data
path_data = r'..\..\data'
path_raw = r'..\..\data\raw'
path_preprocessed = r'..\..\data\preprocessed'
path_additional = r'..\..\data\additional data'
```
%% Cell type:code id: tags:
``` python
# Loads the raw json file for a specific container
def load_data(container):
file_number=path_raw+'\\'+container+'.txt'
df = pd.read_json(file_number, lines=True)
raw_data = pd.DataFrame(df[1][0])
return raw_data
```
%% Cell type:code id: tags:
``` python
# Reverses the dataframe passed into the function
# The loaded raw data is in wrong historical order (Youngest date is the first entry)
def reverse_data(raw_data):
reversed_data = raw_data.loc[::-1].reset_index(drop = True)
return reversed_data
```
%% Cell type:code id: tags:
``` python
# Decodes the json data in the 'decoded_data' column of the dataframe
# Also removes letters from the cloumns listed below and transfroms then into Integers
def dec_data(data):
df_decoded_data = pd.json_normalize(data.decoded_data)
data["time"]=pd.to_datetime(list(map(lambda st: str(st)[0:19],raw_data["created_at"])))
data["Height (cm)"] = 140 - df_decoded_data["sensor_data.Height 1"].str.replace("cm","").astype(int)
data["Voltage (mV)"] = df_decoded_data["sensor_data.Voltage"].str.replace("mV","").astype(int)
data["Temperature (C)"] = df_decoded_data["sensor_data.Temperature"].str.replace("C","").astype(int)
data["Tilt (Degree)"] = df_decoded_data["sensor_data.Tilt"].str.replace("Degree","").astype(int)
return data
```
%% Cell type:code id: tags:
``` python
# Removes outliers from the data
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data):
for idx, row in data[data['Temperature (C)'] >= 150].iterrows():
if idx != 0:
new_value = data.iloc[idx-1]['Temperature (C)']
else:
new_value = data.iloc[idx+1]['Temperature (C)']
data.at[idx, 'Temperature (C)'] = new_value
for idx, row in data[(data['Height (cm)'] < 0)].iterrows():
data.at[idx, 'Height (cm)'] = 0
return data
```
%% Cell type:code id: tags:
``` python
# Writes the dataframe into a .csv file
# Container specifies the name of the file
def write_preprocessed_data(container, data):
data.to_csv(path_or_buf=path_preprocessed+'/'+container+'.txt')
```
%% Cell type:code id: tags:
``` python
# Detects trash collections when sudden changes in Heigth occur
# TODO: improve algorith
# TODO: test on multiple container files
def detect_collection(data):
last_value= None
collections = []
limit = 0.25
for idx, row in data.iterrows():
if last_value != None:
height = row['Height (cm)']
created_at = row['unix_time']
difference = height - last_value
if difference > last_value*limit and height > 110:
collections.append(idx)
last_value = row['Height (cm)']
return collections
```
%% Cell type:code id: tags:
``` python
def get_collection_data(data, collections):
collections_data = []
weather_data = pd.read_excel(path_additional + '\\additional_data.xlsx')
for collection in collections:
idx = collections.index(collection)
row = data.iloc[collection]
collection_data = {}
last_collection = 0
if idx != 0:
last_collection = collections[idx-1]
# Get data since last collection and data of last collection
data_since_last_collection = data.iloc[last_collection:collection]
data_last_collection = data.iloc[last_collection]
data_last_measurement = data.iloc[collection-1]
# Calculate time difference
collection_time = datetime.datetime.fromtimestamp(int(row['unix_time'])/1000)
last_collection_time = datetime.datetime.fromtimestamp(int(data_last_collection['unix_time'])/1000)
time_difference = last_collection_time-collection_time
# Weather Data
weather_data_since_last_collection = weather_data[(weather_data['MESS_DATUM'] <= collection_time) & (weather_data['MESS_DATUM'] >= last_collection_time)]
holiday_count = len(weather_data_since_last_collection[weather_data_since_last_collection['Frei[1/0]']==1].index)/len(weather_data_since_last_collection.index)
# Create collection data entry
collection_data['timestamp'] = row['created_at']
collection_data['container_id'] = row['deveui']
collection_data['last_collection'] = time_difference.days
collection_data['pre_height'] = data_last_measurement['Height (cm)']
collection_data['post_height'] = row['Height (cm)']
collection_data['sensor_mean_temperature'] = data_since_last_collection['Temperature (C)'].mean()
collection_data['sensor_max_temperature'] = data_since_last_collection['Temperature (C)'].max()
collection_data['sensor_min_temperature'] = data_since_last_collection['Temperature (C)'].min()
collection_data['weather_mean_temperature'] = weather_data_since_last_collection['Temperatur'].mean()
collection_data['weather_max_temperature'] = weather_data_since_last_collection['Temperatur'].max()
collection_data['weather_min_temperature'] = weather_data_since_last_collection['Temperatur'].min()
collection_data['weather_mean_rain'] = weather_data_since_last_collection['Niederschlagsmenge'].mean()
collection_data['weather_max_rain'] = weather_data_since_last_collection['Niederschlagsmenge'].max()
collection_data['weather_min_rain'] = weather_data_since_last_collection['Niederschlagsmenge'].min()
collection_data['weather_mean_moisture'] = weather_data_since_last_collection['Relative Feuchte'].mean()
collection_data['weather_max_moisture'] = weather_data_since_last_collection['Relative Feuchte'].max()
collection_data['weather_min_moisture'] = weather_data_since_last_collection['Relative Feuchte'].min()
collection_data['holiday_percentage'] = holiday_count
collection_data['year'] = collection_time.year
collection_data['month'] = collection_time.month
collection_data['weekday'] = collection_time.weekday()
collections_data.append(collection_data)
return collections_data
```
%% Cell type:code id: tags:
``` python
# Each raw data file gets processed and the written into a new file in data/preprocessed
collection_df = pd.DataFrame()
containers = os.listdir(path_raw)
for file in containers:
container_id = file.replace('.txt', '')
raw_data = load_data(container_id)
reversed_data = reverse_data(raw_data)
decoded_data = dec_data(reversed_data)
preprocessed = remove_outliers(decoded_data)
collections = detect_collection(decoded_data)
collections_data = get_collection_data(decoded_data, collections)
collection_df = collection_df.append(collections_data, ignore_index=True, sort=False)
write_preprocessed_data(container_id, preprocessed)
```
%% Cell type:code id: tags:
``` python
# Write all available colletion data into one file in data/preprocessed
collection_df.to_csv(path_or_buf=path_data + '/collection_data.txt')
```
%% Cell type:code id: tags:
``` python
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment