Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ufesk
bda-analytics-challenge-template
Commits
f3dbf29e
Commit
f3dbf29e
authored
Jul 13, 2021
by
uouzl
Browse files
Upload New File
parent
b8cfa6d2
Changes
1
Hide whitespace changes
Inline
Side-by-side
notebooks/1. Preprocessing/Collection_Detection.ipynb
0 → 100644
View file @
f3dbf29e
%% Cell type:code id: tags:
```
python
import
pandas
as
pd
import
numpy
as
np
from
pandas.io.json
import
json_normalize
import
datetime
import
os
```
%% Cell type:code id: tags:
```
python
# Locations of the project data folders, relative to this notebook.
# Separators are Windows-style backslashes, written as raw strings.
path_data = r'..\..\data'
# Sub-folders hang off the data root so the root is spelled only once.
path_raw = path_data + r'\raw'
path_preprocessed = path_data + r'\preprocessed'
path_additional = path_data + r'\additional data'
```
%% Cell type:code id: tags:
```
python
def load_data(container):
    """Load the raw export of one container into a DataFrame.

    Args:
        container: File name (without the ``.txt`` suffix) of the container
            export located in ``path_raw``.

    Returns:
        A DataFrame built from the value found in column ``1`` of the first
        parsed line of the file.
    """
    source_file = '\\'.join((path_raw, container + '.txt'))
    parsed = pd.read_json(source_file, lines=True)
    # NOTE(review): presumably column 1 of the first line holds the list of
    # measurement records -- confirm against a sample export.
    return pd.DataFrame(parsed[1][0])
```
%% Cell type:code id: tags:
```
python
def reverse_data(raw_data):
    """Return the rows of ``raw_data`` in reverse order with a fresh index.

    The raw export lists the youngest entry first, so flipping it puts the
    measurements into chronological order. The input frame is not modified.
    """
    flipped = raw_data.loc[::-1]
    # Drop the old (now descending) index and renumber from zero.
    return flipped.reset_index(drop=True)
```
%% Cell type:code id: tags:
```
python
def dec_data(data):
    """Decode the ``decoded_data`` payload of every row into numeric columns.

    The nested payload is flattened with ``pd.json_normalize``; the unit
    suffix is stripped from each sensor reading and the remainder is cast to
    ``int``. The following columns are added to ``data`` in place:

    * ``time``            -- ``created_at`` truncated to 19 characters and
      parsed as a datetime.
    * ``Height (cm)``     -- ``140`` minus the ``Height 1`` reading.
    * ``Voltage (mV)``, ``Temperature (C)``, ``Tilt (Degree)`` -- the
      corresponding readings without their unit suffix.

    Args:
        data: DataFrame with a ``created_at`` column and a ``decoded_data``
            column holding one nested mapping per row.

    Returns:
        The same ``data`` object, with the columns above added.
    """
    payload = pd.json_normalize(data["decoded_data"].tolist())
    # Pin the decoded frame to the caller's index so the column assignments
    # below line up row by row even when ``data`` has a non-default index.
    payload.index = data.index

    # Use the rows that were passed in, not a notebook-level global, so the
    # timestamps stay attached to the measurements they belong to.
    data["time"] = pd.to_datetime(
        [str(stamp)[:19] for stamp in data["created_at"]]
    )

    def strip_unit(column, unit):
        # regex=False: the unit is a literal suffix, not a pattern.
        return payload[column].str.replace(unit, "", regex=False).astype(int)

    # NOTE(review): 140 looks like the container height in cm, turning the
    # measured value into its complement -- confirm with the sensor spec.
    data["Height (cm)"] = 140 - strip_unit("sensor_data.Height 1", "cm")
    data["Voltage (mV)"] = strip_unit("sensor_data.Voltage", "mV")
    data["Temperature (C)"] = strip_unit("sensor_data.Temperature", "C")
    data["Tilt (Degree)"] = strip_unit("sensor_data.Tilt", "Degree")
    return data
```
%% Cell type:code id: tags:
```
python
# TODO: Right now only temperature outliers are removed
# TODO: Smoothing
def remove_outliers(data, max_temperature=150):
    """Repair implausible temperature and height readings in place.

    * Every ``Temperature (C)`` value ``>= max_temperature`` is replaced by
      the closest earlier valid reading; a leading run of outliers takes the
      first valid reading that follows it. If no valid reading exists at all,
      the column is left untouched.
    * Negative ``Height (cm)`` values are raised to ``0``.

    The repair works on row order only, so it does not depend on the frame
    having a 0..n-1 index.

    Args:
        data: DataFrame with ``Temperature (C)`` and ``Height (cm)`` columns.
        max_temperature: Readings at or above this value count as outliers.

    Returns:
        The same ``data`` object after modification.
    """
    temp_col = 'Temperature (C)'
    height_col = 'Height (cm)'

    temperatures = data[temp_col]
    is_outlier = temperatures >= max_temperature
    if is_outlier.any() and not is_outlier.all():
        # Blank the outliers, then fill from the previous valid reading and,
        # for a leading run, from the next valid one.
        neighbours = temperatures.mask(is_outlier).ffill().bfill()
        repaired = temperatures.where(~is_outlier, neighbours)
        if not repaired.isna().any():
            # Masking upcasts integers to float; restore the original dtype.
            repaired = repaired.astype(temperatures.dtype)
        data[temp_col] = repaired

    data[height_col] = data[height_col].clip(lower=0)
    return data
```
%% Cell type:code id: tags:
```
python
def write_preprocessed_data(container, data):
    """Store ``data`` as CSV in the preprocessed-data folder.

    Args:
        container: Base name of the output file; ``.txt`` is appended.
        data: DataFrame to write.
    """
    target_file = '/'.join((path_preprocessed, container + '.txt'))
    data.to_csv(path_or_buf=target_file)
```
%% Cell type:code id: tags:
```
python
# TODO: improve algorithm
# TODO: test on multiple container files
def detect_collection(data, limit=0.25, min_height=110):
    """Detect trash collections from sudden jumps in ``Height (cm)``.

    A row is reported when its height exceeds the previous row's height by
    more than ``limit`` times that previous height and the new height is
    above ``min_height``. The first row has no predecessor and is never
    reported.

    Args:
        data: DataFrame with a ``Height (cm)`` column, in chronological order.
        limit: Relative increase over the previous height that counts as a
            jump.
        min_height: Height the reading must exceed after the jump.

    Returns:
        List of index labels of the rows at which a collection was detected.
    """
    heights = data['Height (cm)']
    previous = heights.shift(1)
    # Comparisons against the missing predecessor of the first row are False,
    # so that row drops out without special handling.
    jumped = (heights - previous) > previous * limit
    detected = jumped & (heights > min_height)
    return data.index[detected.to_numpy()].tolist()
```
%% Cell type:code id: tags:
```
python
def get_collection_data(data, collections, weather_data=None):
    """Build one feature record per detected collection.

    For every position in ``collections`` the rows since the previous
    collection (or since the first row of ``data`` for the first collection)
    are summarised together with the weather rows whose ``MESS_DATUM`` lies
    between the two collection times.

    Args:
        data: Preprocessed measurements with the columns ``unix_time``
            (milliseconds), ``created_at``, ``deveui``, ``Height (cm)`` and
            ``Temperature (C)``.
        collections: Row positions of the detected collections, in ascending
            order.
        weather_data: Optional DataFrame with the columns ``MESS_DATUM``,
            ``Frei[1/0]``, ``Temperatur``, ``Niederschlagsmenge`` and
            ``Relative Feuchte``. When omitted it is read from
            ``additional_data.xlsx`` in ``path_additional``.

    Returns:
        A list of dicts, one per collection, in the order of ``collections``.
    """
    if weather_data is None:
        weather_data = pd.read_excel(path_additional + '\\additional_data.xlsx')

    collections_data = []
    # Position of the previous collection; the first window starts at row 0.
    last_collection = 0
    for collection in collections:
        row = data.iloc[collection]

        # Get data since last collection and data of last collection
        data_since_last_collection = data.iloc[last_collection:collection]
        data_last_collection = data.iloc[last_collection]
        data_last_measurement = data.iloc[collection - 1]

        # Calculate time difference (unix_time is divided by 1000 to get seconds)
        collection_time = datetime.datetime.fromtimestamp(
            int(row['unix_time']) / 1000
        )
        last_collection_time = datetime.datetime.fromtimestamp(
            int(data_last_collection['unix_time']) / 1000
        )
        # Later minus earlier, so the elapsed days come out non-negative.
        time_difference = collection_time - last_collection_time

        # Weather Data
        measured_at = weather_data['MESS_DATUM']
        in_window = (measured_at <= collection_time) & (
            measured_at >= last_collection_time
        )
        weather_window = weather_data[in_window]
        window_size = len(weather_window.index)
        if window_size:
            free_rows = int((weather_window['Frei[1/0]'] == 1).sum())
            holiday_share = free_rows / window_size
        else:
            # No weather rows in the window: report "unknown" instead of
            # dividing by zero.
            holiday_share = float('nan')

        sensor_temperature = data_since_last_collection['Temperature (C)']
        weather_temperature = weather_window['Temperatur']
        weather_rain = weather_window['Niederschlagsmenge']
        weather_moisture = weather_window['Relative Feuchte']

        # Create collection data entry
        collections_data.append({
            'timestamp': row['created_at'],
            'container_id': row['deveui'],
            'last_collection': time_difference.days,
            'pre_height': data_last_measurement['Height (cm)'],
            'post_height': row['Height (cm)'],
            'sensor_mean_temperature': sensor_temperature.mean(),
            'sensor_max_temperature': sensor_temperature.max(),
            'sensor_min_temperature': sensor_temperature.min(),
            'weather_mean_temperature': weather_temperature.mean(),
            'weather_max_temperature': weather_temperature.max(),
            'weather_min_temperature': weather_temperature.min(),
            'weather_mean_rain': weather_rain.mean(),
            'weather_max_rain': weather_rain.max(),
            'weather_min_rain': weather_rain.min(),
            'weather_mean_moisture': weather_moisture.mean(),
            'weather_max_moisture': weather_moisture.max(),
            'weather_min_moisture': weather_moisture.min(),
            'holiday_percentage': holiday_share,
            'year': collection_time.year,
            'month': collection_time.month,
            'weekday': collection_time.weekday(),
        })
        last_collection = collection
    return collections_data
```
%% Cell type:code id: tags:
```
python
# Each raw data file is processed and then written into a new file in
# data/preprocessed; the collections detected in it are gathered on the side.
collection_records = []
# Only the .txt exports are container files; sorting keeps the order of the
# combined output independent of the directory listing order.
containers = sorted(
    name for name in os.listdir(path_raw) if name.endswith('.txt')
)
for file in containers:
    container_id = file[:-len('.txt')]
    raw_data = load_data(container_id)
    reversed_data = reverse_data(raw_data)
    decoded_data = dec_data(reversed_data)
    preprocessed = remove_outliers(decoded_data)
    collections = detect_collection(decoded_data)
    collections_data = get_collection_data(decoded_data, collections)
    collection_records.extend(collections_data)
    write_preprocessed_data(container_id, preprocessed)

# DataFrame.append no longer exists in pandas 2.x, so the frame is built once
# from all collected records instead of being grown inside the loop.
collection_df = pd.DataFrame(collection_records)
```
%% Cell type:code id: tags:
```
python
# Store the collection records of all containers in a single CSV file,
# collection_data.txt, directly inside the data folder (path_data).
collection_df.to_csv(path_data + '/collection_data.txt')
```
%% Cell type:code id: tags:
```
python
```
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment