{ "cells": [ { "cell_type": "code", "execution_count": 10, "id": "1a7e310f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import os" ] }, { "cell_type": "code", "execution_count": 23, "id": "bddc2445", "metadata": {}, "outputs": [], "source": [ "data_path = r'..\\..\\data'\n", "data='collection_data.txt'\n", "file_number=data_path+'\\\\'+data \n", "df = pd.read_csv(file_number)\n", "df[\"collection_intervall\"]= list(map(lambda z: z*(-1),df[\"last_collection\"]))\n", "df[\"number_collections\"]=np.ones(len(df[\"last_collection\"]))\n", "df[\"number_collections\"]=df[\"number_collections\"].astype(int)" ] }, { "cell_type": "code", "execution_count": 24, "id": "4d9c368e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0timestampcontainer_idlast_collectionpre_heightpost_heightsensor_mean_temperaturesensor_max_temperaturesensor_min_temperatureweather_mean_temperature...weather_mean_moistureweather_max_moistureweather_min_moistureholiday_percentageLockdownyearmonthweekdaycollection_intervallnumber_collections
002020-05-22 18:51:01.74294570B3D500700016DA-141361615.25102947014.283636...58.12121295.025.00.3606060.0202054141
112020-06-05 14:49:42.68121870B3D500700016DA-141201416.41071444416.873193...53.88855493.019.00.3614460.0202064141
222020-06-29 13:47:52.05055370B3D500700016DA-241361418.25544643418.670261...65.89043597.025.00.3756520.0202060241
332020-07-17 13:46:18.28724970B3D500700016DA-181281219.05347645719.258796...58.77314896.022.00.2222220.0202074181
442020-08-07 09:44:36.14967970B3D500700016DA-211181421.98152447621.973000...49.79400095.020.00.2880000.0202084211
..................................................................
411341132021-04-27 20:26:56.51151970B3D50070001789-2522819.600000331011.096000...37.20000051.024.00.0000001.020214121
411441142021-05-05 15:26:20.8263470B3D50070001789-870417.64655232109.835294...61.19786198.025.00.2566841.020215281
411541152021-05-06 16:26:17.37524670B3D50070001789-238615.05555623107.068000...75.32000092.051.00.0000001.020215321
411641162021-05-07 12:26:16.70911270B3D50070001789-140014.21428619127.010000...82.95000095.053.00.0000001.020215411
411741172021-05-07 21:26:16.9010370B3D50070001789-1441219.50000026149.211111...52.33333376.039.00.0000001.020215411
\n", "

4118 rows × 25 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 timestamp container_id \\\n", "0 0 2020-05-22 18:51:01.742945 70B3D500700016DA \n", "1 1 2020-06-05 14:49:42.681218 70B3D500700016DA \n", "2 2 2020-06-29 13:47:52.050553 70B3D500700016DA \n", "3 3 2020-07-17 13:46:18.287249 70B3D500700016DA \n", "4 4 2020-08-07 09:44:36.149679 70B3D500700016DA \n", "... ... ... ... \n", "4113 4113 2021-04-27 20:26:56.511519 70B3D50070001789 \n", "4114 4114 2021-05-05 15:26:20.82634 70B3D50070001789 \n", "4115 4115 2021-05-06 16:26:17.375246 70B3D50070001789 \n", "4116 4116 2021-05-07 12:26:16.709112 70B3D50070001789 \n", "4117 4117 2021-05-07 21:26:16.90103 70B3D50070001789 \n", "\n", " last_collection pre_height post_height sensor_mean_temperature \\\n", "0 -14 136 16 15.251029 \n", "1 -14 120 14 16.410714 \n", "2 -24 136 14 18.255446 \n", "3 -18 128 12 19.053476 \n", "4 -21 118 14 21.981524 \n", "... ... ... ... ... \n", "4113 -2 52 28 19.600000 \n", "4114 -8 70 4 17.646552 \n", "4115 -2 38 6 15.055556 \n", "4116 -1 40 0 14.214286 \n", "4117 -1 44 12 19.500000 \n", "\n", " sensor_max_temperature sensor_min_temperature \\\n", "0 47 0 \n", "1 44 4 \n", "2 43 4 \n", "3 45 7 \n", "4 47 6 \n", "... ... ... \n", "4113 33 10 \n", "4114 32 10 \n", "4115 23 10 \n", "4116 19 12 \n", "4117 26 14 \n", "\n", " weather_mean_temperature ... weather_mean_moisture \\\n", "0 14.283636 ... 58.121212 \n", "1 16.873193 ... 53.888554 \n", "2 18.670261 ... 65.890435 \n", "3 19.258796 ... 58.773148 \n", "4 21.973000 ... 49.794000 \n", "... ... ... ... \n", "4113 11.096000 ... 37.200000 \n", "4114 9.835294 ... 61.197861 \n", "4115 7.068000 ... 75.320000 \n", "4116 7.010000 ... 82.950000 \n", "4117 9.211111 ... 52.333333 \n", "\n", " weather_max_moisture weather_min_moisture holiday_percentage \\\n", "0 95.0 25.0 0.360606 \n", "1 93.0 19.0 0.361446 \n", "2 97.0 25.0 0.375652 \n", "3 96.0 22.0 0.222222 \n", "4 95.0 20.0 0.288000 \n", "... ... ... ... \n", "4113 51.0 24.0 0.000000 \n", "4114 98.0 25.0 0.256684 \n", "4115 92.0 51.0 0.000000 \n", "4116 95.0 53.0 0.000000 \n", "4117 76.0 39.0 0.000000 \n", "\n", " Lockdown year month weekday collection_intervall number_collections \n", "0 0.0 2020 5 4 14 1 \n", "1 0.0 2020 6 4 14 1 \n", "2 0.0 2020 6 0 24 1 \n", "3 0.0 2020 7 4 18 1 \n", "4 0.0 2020 8 4 21 1 \n", "... ... ... ... ... ... ... \n", "4113 1.0 2021 4 1 2 1 \n", "4114 1.0 2021 5 2 8 1 \n", "4115 1.0 2021 5 3 2 1 \n", "4116 1.0 2021 5 4 1 1 \n", "4117 1.0 2021 5 4 1 1 \n", "\n", "[4118 rows x 25 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "id": "6e02f3a6", "metadata": {}, "source": [ "Hinzufügen der Information der geschätzten Anzahl der Leerungen jedes Conatainers" ] }, { "cell_type": "code", "execution_count": 25, "id": "aab3daa4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
number_collections
container_id
70B3D500700016DA22
70B3D500700016DF44
70B3D500700016E043
70B3D500700016E524
70B3D500700016E6104
......
70B3D50070001782118
70B3D500700017865
70B3D5007000178764
70B3D5007000178845
70B3D50070001789112
\n", "

72 rows × 1 columns

\n", "
" ], "text/plain": [ " number_collections\n", "container_id \n", "70B3D500700016DA 22\n", "70B3D500700016DF 44\n", "70B3D500700016E0 43\n", "70B3D500700016E5 24\n", "70B3D500700016E6 104\n", "... ...\n", "70B3D50070001782 118\n", "70B3D50070001786 5\n", "70B3D50070001787 64\n", "70B3D50070001788 45\n", "70B3D50070001789 112\n", "\n", "[72 rows x 1 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "number_collections=pd.DataFrame({'container_id':df[\"container_id\"],'number_collections':df['number_collections']})\n", "number_collections=number_collections.groupby(['container_id']).sum()\n", "number_collections" ] }, { "cell_type": "code", "execution_count": 26, "id": "6bba9c37", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
collection_intervallpre_heightpost_heightsensor_mean_temperaturelockdownnumber_collections
container_id
70B3D500700016DA-16.818182114.36363611.6363649.2865370.59090922
70B3D500700016DF-7.95454565.13636414.63636411.8001920.68181844
70B3D500700016E0-8.74418649.5813952.51162818.1446720.58139543
70B3D500700016E5-15.54166751.4166675.50000011.3601910.54166724
70B3D500700016E6-3.96153867.71153817.57692311.2708620.653846104
.....................
70B3D50070001782-3.65254266.20339018.05084721.8028900.330508118
70B3D50070001786-26.00000064.00000026.40000025.6624320.0000005
70B3D50070001787-6.10937577.65625012.8750006.4968640.79687564
70B3D50070001788-8.31111167.73333310.88888911.6536650.53333345
70B3D50070001789-3.77678669.14285712.50000018.6587660.446429112
\n", "

72 rows × 6 columns

\n", "
" ], "text/plain": [ " collection_intervall pre_height post_height \\\n", "container_id \n", "70B3D500700016DA -16.818182 114.363636 11.636364 \n", "70B3D500700016DF -7.954545 65.136364 14.636364 \n", "70B3D500700016E0 -8.744186 49.581395 2.511628 \n", "70B3D500700016E5 -15.541667 51.416667 5.500000 \n", "70B3D500700016E6 -3.961538 67.711538 17.576923 \n", "... ... ... ... \n", "70B3D50070001782 -3.652542 66.203390 18.050847 \n", "70B3D50070001786 -26.000000 64.000000 26.400000 \n", "70B3D50070001787 -6.109375 77.656250 12.875000 \n", "70B3D50070001788 -8.311111 67.733333 10.888889 \n", "70B3D50070001789 -3.776786 69.142857 12.500000 \n", "\n", " sensor_mean_temperature lockdown number_collections \n", "container_id \n", "70B3D500700016DA 9.286537 0.590909 22 \n", "70B3D500700016DF 11.800192 0.681818 44 \n", "70B3D500700016E0 18.144672 0.581395 43 \n", "70B3D500700016E5 11.360191 0.541667 24 \n", "70B3D500700016E6 11.270862 0.653846 104 \n", "... ... ... ... \n", "70B3D50070001782 21.802890 0.330508 118 \n", "70B3D50070001786 25.662432 0.000000 5 \n", "70B3D50070001787 6.496864 0.796875 64 \n", "70B3D50070001788 11.653665 0.533333 45 \n", "70B3D50070001789 18.658766 0.446429 112 \n", "\n", "[72 rows x 6 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "relevant_data=pd.DataFrame({'container_id':df[\"container_id\"],'collection_intervall':df[\"last_collection\"],\n", " 'pre_height':df[\"pre_height\"],'post_height':df[\"post_height\"],\n", " 'sensor_mean_temperature':df[\"sensor_mean_temperature\"],'lockdown':df['Lockdown']})\n", "data_all=relevant_data.groupby(['container_id']).mean()\n", "data_all['number_collections']=number_collections['number_collections']\n", "data_all" ] }, { "cell_type": "markdown", "id": "63323896", "metadata": {}, "source": [ "Rausfiltern aller \"Ausreißer Container\", damit Modelling nicht verfälscht wird.
\n", "\"Ausreißer Container\" sind definiert als alle Contaner, die mehr als 40 Leerungen haben oder im Durchschnitt ein Leerungsintervall von über 75 Tage. Schwellwerte wurden anhand der Visualisierung des Clustering Notebooks ausgewählt. " ] }, { "cell_type": "code", "execution_count": 27, "id": "4e74d2b7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['70B3D500700016DA',\n", " '70B3D500700016E5',\n", " '70B3D500700016E7',\n", " '70B3D500700016EB',\n", " '70B3D500700016F1',\n", " '70B3D500700016F2',\n", " '70B3D500700016F4',\n", " '70B3D500700016FC',\n", " '70B3D50070001704',\n", " '70B3D50070001706',\n", " '70B3D50070001709',\n", " '70B3D50070001710',\n", " '70B3D50070001712',\n", " '70B3D50070001716',\n", " '70B3D50070001725',\n", " '70B3D50070001726',\n", " '70B3D50070001727',\n", " '70B3D5007000172B',\n", " '70B3D5007000172C',\n", " '70B3D5007000172D',\n", " '70B3D5007000172E',\n", " '70B3D50070001730',\n", " '70B3D50070001734',\n", " '70B3D50070001738',\n", " '70B3D5007000173C',\n", " '70B3D50070001740',\n", " '70B3D50070001742',\n", " '70B3D50070001747',\n", " '70B3D5007000174F',\n", " '70B3D50070001770',\n", " '70B3D50070001779',\n", " '70B3D5007000177C',\n", " '70B3D50070001786']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Schwellwerte\n", "T_colecction_intervall=75\n", "T_number_collections=40\n", "\n", "data_all=data_all[data_all['collection_intervall']<=T_colecction_intervall]\n", "data_all=data_all[data_all['number_collections']<=T_number_collections]\n", "data_all=data_all.reset_index()\n", "container=list(data_all['container_id'])\n", "container" ] }, { "cell_type": "code", "execution_count": 28, "id": "01762362", "metadata": {}, "outputs": [], "source": [ "train_data=df[df['container_id']==container[0]]\n", "for item in container:\n", " if item == container[0]:\n", " None\n", " else: \n", " train_data=train_data.append(df[df['container_id']==item])\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 30, "id": "a1dcbff5", "metadata": {}, "outputs": [], "source": [ "data_path = r'..\\..\\data\\modeling\\train'\n", "train_data.to_csv(path_or_buf=data_path+'\\\\train_data.txt')" ] }, { "cell_type": "code", "execution_count": null, "id": "1b271bda", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 5 }