Commit 05aba3d2 authored by PauTheu's avatar PauTheu
Browse files

added tpot

parent af5babce
.ipynb_checkpoints
.train
.test
\ No newline at end of file
# WindTurbinePrediction
## Others: Versuch_Paul.ipynb:
Data: uses the ten best features identified in the `data_exploration.ipynb` notebook.
### Approaches:
1. LightGBM: leaf-wise (best-first) tree growth instead of level-wise tree growth. Low accuracy.
2. Simple NN: 3 hidden layers (64 units each), batch size 64, 50 epochs, binary classification, ReLU, Adam. Low test accuracy (~0.52).
3. TPOT: automatically searches for the best classification pipeline. Currently only run on the mean-feature data.
......@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 2,
"id": "14ed4e25-21ca-4d3c-9f6e-2e2c06894a0d",
"metadata": {},
"outputs": [],
......@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 3,
"id": "af4e895b-9fb4-4251-bf00-710564d7addc",
"metadata": {},
"outputs": [],
......@@ -52,6 +52,19 @@
"df_test.dropna(inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0fcf1140-299b-44fb-b81b-a999453f833a",
"metadata": {},
"outputs": [],
"source": [
"y_train = df_train[\"label\"]\n",
"y_test = df_test[\"label\"]\n",
"X_train = df_train.drop([\"label\"], axis=1)\n",
"X_test = df_test.drop([\"label\"], axis=1)"
]
},
{
"cell_type": "markdown",
"id": "e67dda33-9761-435f-84fa-d03652fb4fa9",
......@@ -62,7 +75,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": null,
"id": "23dccf02-779d-4210-9723-1dbca8d44c08",
"metadata": {},
"outputs": [],
......@@ -116,19 +129,6 @@
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0fcf1140-299b-44fb-b81b-a999453f833a",
"metadata": {},
"outputs": [],
"source": [
"y_train = df_train[\"label\"]\n",
"y_test = df_test[\"label\"]\n",
"X_train = df_train.drop([\"label\"], axis=1)\n",
"X_test = df_test.drop([\"label\"], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 75,
......@@ -252,7 +252,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": null,
"id": "cb955265-8cf0-430a-88ab-4708876a8147",
"metadata": {},
"outputs": [],
......@@ -269,7 +269,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 6,
"id": "f90bf39c-5781-4b27-98e2-dcd183f925ef",
"metadata": {},
"outputs": [],
......@@ -281,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 87,
"execution_count": 7,
"id": "f88fdd4c-d42b-4981-9729-a949c6780507",
"metadata": {},
"outputs": [],
......@@ -293,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 88,
"execution_count": 8,
"id": "122bf29f-83bb-45ae-b78b-4808959fa58f",
"metadata": {},
"outputs": [],
......@@ -317,7 +317,7 @@
},
{
"cell_type": "code",
"execution_count": 89,
"execution_count": 9,
"id": "c073a012-37fb-4424-ba66-fe344d00e34e",
"metadata": {},
"outputs": [],
......@@ -339,7 +339,7 @@
},
{
"cell_type": "code",
"execution_count": 90,
"execution_count": 10,
"id": "6d8bdcf5-e60c-4379-8024-a13a3ebf4b75",
"metadata": {},
"outputs": [],
......@@ -350,7 +350,7 @@
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 18,
"id": "1c8a9f8d-f758-4a08-9350-b38db859e1b4",
"metadata": {},
"outputs": [],
......@@ -361,18 +361,22 @@
" # Number of input features is 11.\n",
" self.layer_1 = nn.Linear(11, 64) \n",
" self.layer_2 = nn.Linear(64, 64)\n",
" self.layer_3 = nn.Linear(64, 64)\n",
" self.layer_out = nn.Linear(64, 1) \n",
" \n",
" self.relu = nn.ReLU()\n",
" self.dropout = nn.Dropout(p=0.1)\n",
" self.batchnorm1 = nn.BatchNorm1d(64)\n",
" self.batchnorm2 = nn.BatchNorm1d(64)\n",
" self.batchnorm3 = nn.BatchNorm1d(64)\n",
" \n",
" def forward(self, inputs):\n",
" x = self.relu(self.layer_1(inputs))\n",
" x = self.batchnorm1(x)\n",
" x = self.relu(self.layer_2(x))\n",
" x = self.batchnorm2(x)\n",
" x = self.relu(self.layer_3(x))\n",
" x = self.batchnorm3(x)\n",
" x = self.dropout(x)\n",
" x = self.layer_out(x)\n",
" \n",
......@@ -381,7 +385,7 @@
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 19,
"id": "3896791a-41eb-4787-a0c3-93bd21282c19",
"metadata": {},
"outputs": [
......@@ -400,7 +404,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": 20,
"id": "7680c582-2d92-484c-bbd4-f22889f63647",
"metadata": {},
"outputs": [
......@@ -411,11 +415,13 @@
"binaryClassification(\n",
" (layer_1): Linear(in_features=11, out_features=64, bias=True)\n",
" (layer_2): Linear(in_features=64, out_features=64, bias=True)\n",
" (layer_3): Linear(in_features=64, out_features=64, bias=True)\n",
" (layer_out): Linear(in_features=64, out_features=1, bias=True)\n",
" (relu): ReLU()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
")\n"
]
}
......@@ -430,7 +436,7 @@
},
{
"cell_type": "code",
"execution_count": 95,
"execution_count": 21,
"id": "6746f086-fa63-4c1c-a49e-310d7e2ec4b7",
"metadata": {},
"outputs": [],
......@@ -447,7 +453,7 @@
},
{
"cell_type": "code",
"execution_count": 96,
"execution_count": 22,
"id": "93f8739a-7dfa-4e34-9f66-81e942942d34",
"metadata": {},
"outputs": [
......@@ -455,56 +461,56 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 001: | Loss: 0.58836 | Acc: 67.731\n",
"Epoch 002: | Loss: 0.53872 | Acc: 71.175\n",
"Epoch 003: | Loss: 0.51944 | Acc: 72.415\n",
"Epoch 004: | Loss: 0.50751 | Acc: 72.885\n",
"Epoch 005: | Loss: 0.49957 | Acc: 73.498\n",
"Epoch 006: | Loss: 0.49470 | Acc: 73.840\n",
"Epoch 007: | Loss: 0.48752 | Acc: 74.038\n",
"Epoch 008: | Loss: 0.48339 | Acc: 74.573\n",
"Epoch 009: | Loss: 0.47710 | Acc: 74.723\n",
"Epoch 010: | Loss: 0.47275 | Acc: 75.062\n",
"Epoch 011: | Loss: 0.47231 | Acc: 75.313\n",
"Epoch 012: | Loss: 0.46792 | Acc: 75.733\n",
"Epoch 013: | Loss: 0.46326 | Acc: 75.588\n",
"Epoch 014: | Loss: 0.46432 | Acc: 75.515\n",
"Epoch 015: | Loss: 0.46245 | Acc: 75.508\n",
"Epoch 016: | Loss: 0.46001 | Acc: 75.812\n",
"Epoch 017: | Loss: 0.45738 | Acc: 75.963\n",
"Epoch 018: | Loss: 0.45510 | Acc: 76.138\n",
"Epoch 019: | Loss: 0.45753 | Acc: 75.990\n",
"Epoch 020: | Loss: 0.44936 | Acc: 76.512\n",
"Epoch 021: | Loss: 0.45486 | Acc: 76.019\n",
"Epoch 022: | Loss: 0.44876 | Acc: 76.538\n",
"Epoch 023: | Loss: 0.45166 | Acc: 76.363\n",
"Epoch 024: | Loss: 0.45067 | Acc: 76.340\n",
"Epoch 025: | Loss: 0.44514 | Acc: 76.667\n",
"Epoch 026: | Loss: 0.44440 | Acc: 76.779\n",
"Epoch 027: | Loss: 0.44393 | Acc: 76.531\n",
"Epoch 028: | Loss: 0.43909 | Acc: 76.902\n",
"Epoch 029: | Loss: 0.44011 | Acc: 77.085\n",
"Epoch 030: | Loss: 0.43999 | Acc: 76.960\n",
"Epoch 031: | Loss: 0.43862 | Acc: 76.775\n",
"Epoch 032: | Loss: 0.43671 | Acc: 77.077\n",
"Epoch 033: | Loss: 0.43578 | Acc: 77.098\n",
"Epoch 034: | Loss: 0.43612 | Acc: 77.173\n",
"Epoch 035: | Loss: 0.43450 | Acc: 77.048\n",
"Epoch 036: | Loss: 0.43584 | Acc: 76.885\n",
"Epoch 037: | Loss: 0.43497 | Acc: 77.185\n",
"Epoch 038: | Loss: 0.43394 | Acc: 77.125\n",
"Epoch 039: | Loss: 0.43347 | Acc: 77.196\n",
"Epoch 040: | Loss: 0.43379 | Acc: 77.235\n",
"Epoch 041: | Loss: 0.43027 | Acc: 77.267\n",
"Epoch 042: | Loss: 0.42924 | Acc: 77.325\n",
"Epoch 043: | Loss: 0.42890 | Acc: 77.556\n",
"Epoch 044: | Loss: 0.42649 | Acc: 77.806\n",
"Epoch 045: | Loss: 0.42746 | Acc: 77.508\n",
"Epoch 046: | Loss: 0.42936 | Acc: 77.365\n",
"Epoch 047: | Loss: 0.42697 | Acc: 77.654\n",
"Epoch 048: | Loss: 0.42456 | Acc: 77.692\n",
"Epoch 049: | Loss: 0.42648 | Acc: 77.763\n",
"Epoch 050: | Loss: 0.42833 | Acc: 77.573\n"
"Epoch 001: | Loss: 0.57255 | Acc: 68.510\n",
"Epoch 002: | Loss: 0.51966 | Acc: 72.412\n",
"Epoch 003: | Loss: 0.50320 | Acc: 72.960\n",
"Epoch 004: | Loss: 0.48726 | Acc: 73.865\n",
"Epoch 005: | Loss: 0.48121 | Acc: 74.381\n",
"Epoch 006: | Loss: 0.47483 | Acc: 74.637\n",
"Epoch 007: | Loss: 0.46612 | Acc: 75.467\n",
"Epoch 008: | Loss: 0.45879 | Acc: 75.863\n",
"Epoch 009: | Loss: 0.45398 | Acc: 75.887\n",
"Epoch 010: | Loss: 0.45117 | Acc: 76.433\n",
"Epoch 011: | Loss: 0.44418 | Acc: 76.796\n",
"Epoch 012: | Loss: 0.44035 | Acc: 76.696\n",
"Epoch 013: | Loss: 0.43966 | Acc: 76.646\n",
"Epoch 014: | Loss: 0.43620 | Acc: 76.937\n",
"Epoch 015: | Loss: 0.43335 | Acc: 77.183\n",
"Epoch 016: | Loss: 0.43053 | Acc: 77.279\n",
"Epoch 017: | Loss: 0.42891 | Acc: 77.277\n",
"Epoch 018: | Loss: 0.42899 | Acc: 77.281\n",
"Epoch 019: | Loss: 0.42436 | Acc: 77.512\n",
"Epoch 020: | Loss: 0.42328 | Acc: 77.627\n",
"Epoch 021: | Loss: 0.42370 | Acc: 77.792\n",
"Epoch 022: | Loss: 0.42102 | Acc: 77.644\n",
"Epoch 023: | Loss: 0.42057 | Acc: 77.800\n",
"Epoch 024: | Loss: 0.41770 | Acc: 77.879\n",
"Epoch 025: | Loss: 0.41552 | Acc: 78.081\n",
"Epoch 026: | Loss: 0.41711 | Acc: 77.954\n",
"Epoch 027: | Loss: 0.41543 | Acc: 77.835\n",
"Epoch 028: | Loss: 0.40957 | Acc: 78.260\n",
"Epoch 029: | Loss: 0.41276 | Acc: 78.279\n",
"Epoch 030: | Loss: 0.40988 | Acc: 78.279\n",
"Epoch 031: | Loss: 0.41333 | Acc: 78.271\n",
"Epoch 032: | Loss: 0.40900 | Acc: 78.362\n",
"Epoch 033: | Loss: 0.40603 | Acc: 78.381\n",
"Epoch 034: | Loss: 0.40833 | Acc: 78.404\n",
"Epoch 035: | Loss: 0.40930 | Acc: 78.338\n",
"Epoch 036: | Loss: 0.40871 | Acc: 78.440\n",
"Epoch 037: | Loss: 0.40228 | Acc: 78.644\n",
"Epoch 038: | Loss: 0.39928 | Acc: 78.852\n",
"Epoch 039: | Loss: 0.40197 | Acc: 79.058\n",
"Epoch 040: | Loss: 0.39958 | Acc: 78.588\n",
"Epoch 041: | Loss: 0.40289 | Acc: 78.994\n",
"Epoch 042: | Loss: 0.39919 | Acc: 78.838\n",
"Epoch 043: | Loss: 0.39836 | Acc: 78.873\n",
"Epoch 044: | Loss: 0.39842 | Acc: 78.838\n",
"Epoch 045: | Loss: 0.39995 | Acc: 78.815\n",
"Epoch 046: | Loss: 0.39698 | Acc: 78.896\n",
"Epoch 047: | Loss: 0.39460 | Acc: 78.965\n",
"Epoch 048: | Loss: 0.39443 | Acc: 79.200\n",
"Epoch 049: | Loss: 0.39654 | Acc: 79.104\n",
"Epoch 050: | Loss: 0.39124 | Acc: 79.223\n"
]
}
],
......@@ -534,7 +540,7 @@
},
{
"cell_type": "code",
"execution_count": 97,
"execution_count": 23,
"id": "13260656-39d3-4bc7-91fe-e7f5fe09c6d2",
"metadata": {},
"outputs": [],
......@@ -554,7 +560,7 @@
},
{
"cell_type": "code",
"execution_count": 98,
"execution_count": 24,
"id": "1bd71613-25f7-4342-aa58-2abebce0f5f9",
"metadata": {},
"outputs": [
......@@ -564,12 +570,12 @@
"text": [
" precision recall f1-score support\n",
"\n",
" 0.0 0.53 0.61 0.57 7533\n",
" 1.0 0.54 0.45 0.49 7394\n",
" 0.0 0.52 0.60 0.56 7533\n",
" 1.0 0.51 0.43 0.47 7394\n",
"\n",
" accuracy 0.53 14927\n",
" macro avg 0.53 0.53 0.53 14927\n",
"weighted avg 0.53 0.53 0.53 14927\n",
" accuracy 0.52 14927\n",
" macro avg 0.52 0.52 0.51 14927\n",
"weighted avg 0.52 0.52 0.51 14927\n",
"\n"
]
}
......@@ -578,10 +584,226 @@
"print(classification_report(y_test, y_pred_list))"
]
},
{
"cell_type": "markdown",
"id": "55bbaee2-335c-40ed-a5ad-3a616c91773f",
"metadata": {},
"source": [
"### TPOT"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "5b393d8b-f625-4a26-b4e3-fa8b9e15dba6",
"metadata": {},
"outputs": [],
"source": [
"from tpot import TPOTClassifier"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5efe5da7-3642-4c68-a8bf-c8e093c937a6",
"metadata": {},
"outputs": [],
"source": [
"df_train = pkl.load(open(\"./data/train\" ,\"rb\"))\n",
"df_test = pkl.load(open(\"./data/test\" ,\"rb\"))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5c0b7f2e-2da4-4538-8e28-e9d06746c1ec",
"metadata": {},
"outputs": [],
"source": [
"y_train = df_train[\"ret\"]\n",
"y_test = df_test[\"ret\"]\n",
"X_train = df_train.drop([\"ret\"], axis=1)\n",
"X_test = df_test.drop([\"ret\"], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a17b4a46-ddc4-451a-985b-4cf9f4e54ae7",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Optimization Progress: 0%| | 0/300 [00:00<?, ?pipeline/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generation 1 - Current best internal CV score: 0.6669664268585132\n",
"\n",
"Generation 2 - Current best internal CV score: 0.66810551558753\n",
"\n",
"Generation 3 - Current best internal CV score: 0.6792565947242207\n",
"\n",
"Generation 4 - Current best internal CV score: 0.6792565947242207\n",
"\n",
"Generation 5 - Current best internal CV score: 0.6792565947242207\n",
"\n",
"Best pipeline: MLPClassifier(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.15000000000000002, min_samples_leaf=4, min_samples_split=17, n_estimators=100), alpha=0.1, learning_rate_init=0.001)\n",
"0.512618507143811\n"
]
}
],
"source": [
"tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42, n_jobs = -1)\n",
"tpot.fit(X_train, y_train)\n",
"print(tpot.score(X_test, y_test))"
]
},
{
"cell_type": "markdown",
"id": "22625cc8-5113-40b1-8d46-bc7289d34487",
"metadata": {},
"source": [
"### TPOT on the ten-best-features data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "42ad79c1-3be2-41ae-b673-93c2fb20725d",
"metadata": {},
"outputs": [],
"source": [
"df_test = pkl.load(open(\"test_ten_best_features.pkl\" ,\"rb\"))\n",
"df_test.dropna(inplace = True)\n",
"df_train = pkl.load(open(\"train_ten_best_features.pkl\" ,\"rb\"))\n",
"df_train.dropna(inplace = True)\n",
"y_train = df_train[\"label\"]\n",
"y_test = df_test[\"label\"]\n",
"X_train = df_train.drop([\"label\"], axis=1)\n",
"X_test = df_test.drop([\"label\"], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2adbd233-18f1-4fc6-97b0-0d75ae7f5544",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5f30ecffec584948a8684ee2be153921",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Optimization Progress: 0%| | 0/50 [00:00<?, ?pipeline/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generation 1 - Current best internal CV score: 0.6320305182449035\n",
"\n",
"Generation 2 - Current best internal CV score: 0.6351871562198599\n",
"\n",
"Generation 3 - Current best internal CV score: 0.6382534647018571\n",
"\n",
"Generation 4 - Current best internal CV score: 0.6421921613992991\n",
"\n",
"Generation 5 - Current best internal CV score: 0.6421921613992991\n",
"\n",
"Generation 6 - Current best internal CV score: 0.6438173782411489\n",
"\n",
"Generation 7 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 8 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 9 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 10 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 11 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 12 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 13 - Current best internal CV score: 0.6466120971925099\n",
"\n",
"Generation 14 - Current best internal CV score: 0.6504599989208184\n",
"\n",
"Generation 15 - Current best internal CV score: 0.6504599989208184\n",
"\n",
"Generation 16 - Current best internal CV score: 0.6547001027391782\n",
"\n",
"Generation 17 - Current best internal CV score: 0.6547001027391782\n",
"\n",
"Generation 18 - Current best internal CV score: 0.6547001027391782\n",
"\n",
"Generation 19 - Current best internal CV score: 0.6547001027391782\n",
"\n",
"Generation 20 - Current best internal CV score: 0.6547001027391782\n",
"\n",
"Generation 21 - Current best internal CV score: 0.6547001027391782\n",
"\n",
"\n",
"TPOT closed during evaluation in one generation.\n",
"WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.\n",
"\n",
"\n",
"TPOT closed prematurely. Will use the current best pipeline.\n",
"\n",
"Best pipeline: RandomForestClassifier(PCA(FastICA(ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=gini, max_features=0.7500000000000001, min_samples_leaf=3, min_samples_split=16, n_estimators=100), tol=0.2), iterated_power=6, svd_solver=randomized), bootstrap=True, criterion=entropy, max_features=0.15000000000000002, min_samples_leaf=14, min_samples_split=15, n_estimators=100)\n",
"0.521337174248007\n"
]
}
],
"source": [
"tpot = TPOTClassifier(generations=100, population_size=50, verbosity=2, random_state=42, n_jobs = -1, max_time_mins= 120)\n",
"tpot.fit(X_train, y_train) \n",
"print(tpot.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "801d72f6-5b32-4129-a7c3-0661b542728a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.521337174248007\n"
]
}
],
"source": [
"print(tpot.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d2f8d98-c965-494a-b3c7-7b822e3ab6f0",
"id": "973c03f8-0463-4951-9475-6d0eb97e74b1",
"metadata": {},
"outputs": [],
"source": []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment