Commit 3e9b5d0c authored by Oliver Wirth
parents ee641617 af5babce
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b23a2e7d-3719-4a6a-8142-8904143d239e",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"import os\n",
"import pandas as pd\n",
"\n",
"from tqdm import tqdm\n",
"import numpy as np\n",
"from sklearn.metrics import f1_score\n",
"\n",
"# Parameter optimization\n",
"from skopt.space import Integer, Real, Categorical, Identity\n",
"from skopt.utils import use_named_args\n",
"from skopt import gp_minimize\n",
"from skopt.plots import plot_convergence\n",
"\n",
"# Model\n",
"from sklearn import svm\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pickle as pkl\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "14ed4e25-21ca-4d3c-9f6e-2e2c06894a0d",
"metadata": {},
"outputs": [],
"source": [
"df_train = pkl.load(open(\"train_ten_best_features.pkl\" ,\"rb\"))\n",
"df_train.dropna(inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "af4e895b-9fb4-4251-bf00-710564d7addc",
"metadata": {},
"outputs": [],
"source": [
"df_test = pkl.load(open(\"test_ten_best_features.pkl\" ,\"rb\"))\n",
"df_test.dropna(inplace = True)"
]
},
{
"cell_type": "markdown",
"id": "e67dda33-9761-435f-84fa-d03652fb4fa9",
"metadata": {},
"source": [
"### LightGBM"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "23dccf02-779d-4210-9723-1dbca8d44c08",
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.model_selection import GridSearchCV\n",
"import json\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "f66fc2b3-2666-46ef-80d2-d0d260b8a963",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7745966692414835"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.all().corr(df_test.all())"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "47d34961-594d-47bf-8fed-7c2bd15e2e4f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(33264, 11)"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "0fcf1140-299b-44fb-b81b-a999453f833a",
"metadata": {},
"outputs": [],
"source": [
"y_train = df_train[\"label\"]\n",
"y_test = df_test[\"label\"]\n",
"X_train = df_train.drop([\"label\"], axis=1)\n",
"X_test = df_test.drop([\"label\"], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "4efa2b64-9bda-4f0e-858a-815915b44732",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting training...\n",
"[1]\tvalid_0's l1: 0.499973\tvalid_0's l2: 0.250123\n",
"[2]\tvalid_0's l1: 0.500054\tvalid_0's l2: 0.25024\n",
"[3]\tvalid_0's l1: 0.500134\tvalid_0's l2: 0.25037\n",
"[4]\tvalid_0's l1: 0.500213\tvalid_0's l2: 0.250517\n",
"[5]\tvalid_0's l1: 0.500297\tvalid_0's l2: 0.25068\n",
"[6]\tvalid_0's l1: 0.500332\tvalid_0's l2: 0.250821\n",
"[7]\tvalid_0's l1: 0.500422\tvalid_0's l2: 0.251018\n",
"[8]\tvalid_0's l1: 0.500395\tvalid_0's l2: 0.25095\n",
"[9]\tvalid_0's l1: 0.500471\tvalid_0's l2: 0.251157\n",
"[10]\tvalid_0's l1: 0.50056\tvalid_0's l2: 0.251371\n",
"[11]\tvalid_0's l1: 0.500656\tvalid_0's l2: 0.251632\n",
"[12]\tvalid_0's l1: 0.500624\tvalid_0's l2: 0.251541\n",
"[13]\tvalid_0's l1: 0.500698\tvalid_0's l2: 0.251785\n",
"[14]\tvalid_0's l1: 0.500799\tvalid_0's l2: 0.25205\n",
"[15]\tvalid_0's l1: 0.50093\tvalid_0's l2: 0.252347\n",
"[16]\tvalid_0's l1: 0.501018\tvalid_0's l2: 0.252634\n",
"[17]\tvalid_0's l1: 0.501099\tvalid_0's l2: 0.252942\n",
"[18]\tvalid_0's l1: 0.501186\tvalid_0's l2: 0.253254\n",
"[19]\tvalid_0's l1: 0.501273\tvalid_0's l2: 0.253559\n",
"[20]\tvalid_0's l1: 0.501362\tvalid_0's l2: 0.253862\n",
"[21]\tvalid_0's l1: 0.501349\tvalid_0's l2: 0.253796\n",
"[22]\tvalid_0's l1: 0.501435\tvalid_0's l2: 0.254137\n",
"[23]\tvalid_0's l1: 0.501522\tvalid_0's l2: 0.254505\n",
"[24]\tvalid_0's l1: 0.501558\tvalid_0's l2: 0.254815\n",
"[25]\tvalid_0's l1: 0.501637\tvalid_0's l2: 0.255198\n",
"[26]\tvalid_0's l1: 0.501581\tvalid_0's l2: 0.255414\n",
"[27]\tvalid_0's l1: 0.501488\tvalid_0's l2: 0.255611\n",
"[28]\tvalid_0's l1: 0.501447\tvalid_0's l2: 0.25547\n",
"[29]\tvalid_0's l1: 0.501474\tvalid_0's l2: 0.255782\n",
"[30]\tvalid_0's l1: 0.501404\tvalid_0's l2: 0.256021\n",
"Starting predicting...\n",
"The rmse of prediction is: 0.5059851983827391\n",
"Feature importances: [0, 117, 110, 276, 32, 108, 234, 70, 60, 460, 3]\n"
]
}
],
"source": [
"# create dataset for lightgbm\n",
"lgb_train = lgb.Dataset(X_train, y_train)\n",
"lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)\n",
"\n",
"# specify your configurations as a dict\n",
"print('Starting training...')\n",
"# train\n",
"gbm = lgb.LGBMRegressor(num_leaves=50,\n",
" learning_rate=0.01,\n",
" n_estimators=30,\n",
" boosting_type = \"dart\")\n",
"gbm.fit(X_train, y_train,\n",
" eval_set=[(X_test, y_test)],\n",
" eval_metric='l1')\n",
"\n",
"print('Starting predicting...')\n",
"# predict\n",
"y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)\n",
"# eval\n",
"print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)\n",
"\n",
"# feature importances\n",
"print('Feature importances:', list(gbm.feature_importances_))"
]
},
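{
"cell_type": "markdown",
"id": "3f7a2c1e-8d4b-4e9a-b6c5-1f2a3b4c5d6e",
"metadata": {},
"source": [
"The labels are binary, so the regressor's RMSE is hard to interpret on its own. A minimal, hypothetical follow-up (not part of the original run): threshold the regression outputs at 0.5 to obtain class labels and score them with the `f1_score` and `accuracy_score` already imported above."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c8b7a6d-5e4f-4a3b-8c2d-1e0f9a8b7c6d",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch: convert the regression outputs to 0/1 labels\n",
"# (threshold of 0.5 assumed) and evaluate with classification metrics.\n",
"y_pred_class = (y_pred >= 0.5).astype(int)\n",
"print('F1:', f1_score(y_test, y_pred_class))\n",
"print('Accuracy:', accuracy_score(y_test, y_pred_class))"
]
},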
{
"cell_type": "code",
"execution_count": 50,
"id": "356b6771-39fe-42ee-90f5-ec5b63b40bee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters found by grid search are: {'boosting_type': 'dart', 'learning_rate': 0.01, 'n_estimators': 30, 'num_leaves': 50}\n"
]
}
],
"source": [
"estimator = lgb.LGBMRegressor(num_leaves=31)\n",
"\n",
"param_grid = {\n",
" 'learning_rate': [0.01, 0.05, 0.1],\n",
" 'n_estimators': [20, 30, 40, 200],\n",
" 'num_leaves': [20, 31, 50]\n",
" 'boosting_type': ['gbdt', 'dart', 'goss', 'rf'] \n",
"}\n",
"\n",
"gbm = GridSearchCV(estimator, param_grid, cv=3)\n",
"gbm.fit(X_train, y_train)\n",
"\n",
"print('Best parameters found by grid search are:', gbm.best_params_)"
]
},
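{
"cell_type": "markdown",
"id": "2d4f6a8c-0b1e-4c3d-9e5f-7a8b9c0d1e2f",
"metadata": {},
"source": [
"`GridSearchCV` refits the best configuration on the full training set by default (`refit=True`), so the tuned model can be evaluated directly. A minimal sketch of that follow-up (not part of the original run), assuming the fitted `gbm` from the previous cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e6f7a8b-9c0d-4e1f-8a2b-3c4d5e6f7a8b",
"metadata": {},
"outputs": [],
"source": [
"# Evaluate the refitted best estimator on the held-out test set.\n",
"best_gbm = gbm.best_estimator_\n",
"y_pred_best = best_gbm.predict(X_test)\n",
"print('Tuned RMSE:', mean_squared_error(y_test, y_pred_best) ** 0.5)"
]
},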
{
"cell_type": "code",
"execution_count": null,
"id": "afe897d8-3050-4f9a-9b0c-9dac793baa3b",
"metadata": {},
"outputs": [],
"source": [
"## yikes"
]
},
{
"cell_type": "markdown",
"id": "fbf4bd8c-fbd0-4cc7-9aa3-c98969c9a55c",
"metadata": {},
"source": [
"### EASY NN"
]
},
{
"cell_type": "code",
"execution_count": 85,
"id": "cb955265-8cf0-430a-88ab-4708876a8147",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"\n",
"from sklearn.preprocessing import StandardScaler \n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix, classification_report"
]
},
{
"cell_type": "code",
"execution_count": 86,
"id": "f90bf39c-5781-4b27-98e2-dcd183f925ef",
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train)\n",
"X_test = scaler.transform(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"id": "f88fdd4c-d42b-4981-9729-a949c6780507",
"metadata": {},
"outputs": [],
"source": [
"EPOCHS = 50\n",
"BATCH_SIZE = 64\n",
"LEARNING_RATE = 0.001"
]
},
{
"cell_type": "code",
"execution_count": 88,
"id": "122bf29f-83bb-45ae-b78b-4808959fa58f",
"metadata": {},
"outputs": [],
"source": [
"class trainData(Dataset):\n",
" \n",
" def __init__(self, X_data, y_data):\n",
" self.X_data = X_data\n",
" self.y_data = y_data\n",
" \n",
" def __getitem__(self, index):\n",
" return self.X_data[index], self.y_data[index]\n",
" \n",
" def __len__ (self):\n",
" return len(self.X_data)\n",
"\n",
"\n",
"train_data = trainData(torch.FloatTensor(X_train), \n",
" torch.FloatTensor(y_train))"
]
},
{
"cell_type": "code",
"execution_count": 89,
"id": "c073a012-37fb-4424-ba66-fe344d00e34e",
"metadata": {},
"outputs": [],
"source": [
"class testData(Dataset):\n",
" \n",
" def __init__(self, X_data):\n",
" self.X_data = X_data\n",
" \n",
" def __getitem__(self, index):\n",
" return self.X_data[index]\n",
" \n",
" def __len__ (self):\n",
" return len(self.X_data)\n",
" \n",
"\n",
"test_data = testData(torch.FloatTensor(X_test))"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "6d8bdcf5-e60c-4379-8024-a13a3ebf4b75",
"metadata": {},
"outputs": [],
"source": [
"train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)\n",
"test_loader = DataLoader(dataset=test_data, batch_size=1)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"id": "1c8a9f8d-f758-4a08-9350-b38db859e1b4",
"metadata": {},
"outputs": [],
"source": [
"class binaryClassification(nn.Module):\n",
" def __init__(self):\n",
" super(binaryClassification, self).__init__()\n",
" # Number of input features is 11.\n",
" self.layer_1 = nn.Linear(11, 64) \n",
" self.layer_2 = nn.Linear(64, 64)\n",
" self.layer_out = nn.Linear(64, 1) \n",
" \n",
" self.relu = nn.ReLU()\n",
" self.dropout = nn.Dropout(p=0.1)\n",
" self.batchnorm1 = nn.BatchNorm1d(64)\n",
" self.batchnorm2 = nn.BatchNorm1d(64)\n",
" \n",
" def forward(self, inputs):\n",
" x = self.relu(self.layer_1(inputs))\n",
" x = self.batchnorm1(x)\n",
" x = self.relu(self.layer_2(x))\n",
" x = self.batchnorm2(x)\n",
" x = self.dropout(x)\n",
" x = self.layer_out(x)\n",
" \n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "3896791a-41eb-4787-a0c3-93bd21282c19",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cuda:0\n"
]
}
],
"source": [
"device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
"print(device)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"id": "7680c582-2d92-484c-bbd4-f22889f63647",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"binaryClassification(\n",
" (layer_1): Linear(in_features=11, out_features=64, bias=True)\n",
" (layer_2): Linear(in_features=64, out_features=64, bias=True)\n",
" (layer_out): Linear(in_features=64, out_features=1, bias=True)\n",
" (relu): ReLU()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
")\n"
]
}
],
"source": [
"model = binaryClassification()\n",
"model.to(device)\n",
"print(model)\n",
"criterion = nn.BCEWithLogitsLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)\n"
]
},
{
"cell_type": "code",
"execution_count": 95,
"id": "6746f086-fa63-4c1c-a49e-310d7e2ec4b7",
"metadata": {},
"outputs": [],
"source": [
"def binary_acc(y_pred, y_test):\n",
" y_pred_tag = torch.round(torch.sigmoid(y_pred))\n",
"\n",
" correct_results_sum = (y_pred_tag == y_test).sum().float()\n",
" acc = correct_results_sum/y_test.shape[0]\n",
" acc = torch.round(acc * 100)\n",
" \n",
" return acc"
]
},
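{
"cell_type": "markdown",
"id": "7b8c9d0e-1f2a-4b3c-8d4e-5f6a7b8c9d0e",
"metadata": {},
"source": [
"A quick sanity check of `binary_acc` on dummy logits (hypothetical example, not from the original notebook): the sigmoids round to [1, 0, 1, 1] against targets [1, 0, 0, 1], so 3 of 4 predictions match and the expected result is 75."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e1f2a3b-4c5d-4e6f-9a7b-8c9d0e1f2a3b",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sanity check: 3 of 4 rounded predictions match the targets.\n",
"logits = torch.tensor([[2.0], [-3.0], [1.5], [0.5]])\n",
"targets = torch.tensor([[1.0], [0.0], [0.0], [1.0]])\n",
"print(binary_acc(logits, targets))  # tensor(75.)"
]
},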
{
"cell_type": "code",
"execution_count": 96,
"id": "93f8739a-7dfa-4e34-9f66-81e942942d34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 001: | Loss: 0.58836 | Acc: 67.731\n",
"Epoch 002: | Loss: 0.53872 | Acc: 71.175\n",
"Epoch 003: | Loss: 0.51944 | Acc: 72.415\n",
"Epoch 004: | Loss: 0.50751 | Acc: 72.885\n",
"Epoch 005: | Loss: 0.49957 | Acc: 73.498\n",
"Epoch 006: | Loss: 0.49470 | Acc: 73.840\n",
"Epoch 007: | Loss: 0.48752 | Acc: 74.038\n",
"Epoch 008: | Loss: 0.48339 | Acc: 74.573\n",
"Epoch 009: | Loss: 0.47710 | Acc: 74.723\n",
"Epoch 010: | Loss: 0.47275 | Acc: 75.062\n",
"Epoch 011: | Loss: 0.47231 | Acc: 75.313\n",
"Epoch 012: | Loss: 0.46792 | Acc: 75.733\n",
"Epoch 013: | Loss: 0.46326 | Acc: 75.588\n",
"Epoch 014: | Loss: 0.46432 | Acc: 75.515\n",
"Epoch 015: | Loss: 0.46245 | Acc: 75.508\n",
"Epoch 016: | Loss: 0.46001 | Acc: 75.812\n",
"Epoch 017: | Loss: 0.45738 | Acc: 75.963\n",
"Epoch 018: | Loss: 0.45510 | Acc: 76.138\n",
"Epoch 019: | Loss: 0.45753 | Acc: 75.990\n",
"Epoch 020: | Loss: 0.44936 | Acc: 76.512\n",
"Epoch 021: | Loss: 0.45486 | Acc: 76.019\n",
"Epoch 022: | Loss: 0.44876 | Acc: 76.538\n",
"Epoch 023: | Loss: 0.45166 | Acc: 76.363\n",
"Epoch 024: | Loss: 0.45067 | Acc: 76.340\n",
"Epoch 025: | Loss: 0.44514 | Acc: 76.667\n",
"Epoch 026: | Loss: 0.44440 | Acc: 76.779\n",
"Epoch 027: | Loss: 0.44393 | Acc: 76.531\n",
"Epoch 028: | Loss: 0.43909 | Acc: 76.902\n",
"Epoch 029: | Loss: 0.44011 | Acc: 77.085\n",
"Epoch 030: | Loss: 0.43999 | Acc: 76.960\n",
"Epoch 031: | Loss: 0.43862 | Acc: 76.775\n",
"Epoch 032: | Loss: 0.43671 | Acc: 77.077\n",
"Epoch 033: | Loss: 0.43578 | Acc: 77.098\n",
"Epoch 034: | Loss: 0.43612 | Acc: 77.173\n",
"Epoch 035: | Loss: 0.43450 | Acc: 77.048\n",
"Epoch 036: | Loss: 0.43584 | Acc: 76.885\n",
"Epoch 037: | Loss: 0.43497 | Acc: 77.185\n",
"Epoch 038: | Loss: 0.43394 | Acc: 77.125\n",
"Epoch 039: | Loss: 0.43347 | Acc: 77.196\n",
"Epoch 040: | Loss: 0.43379 | Acc: 77.235\n",
"Epoch 041: | Loss: 0.43027 | Acc: 77.267\n",
"Epoch 042: | Loss: 0.42924 | Acc: 77.325\n",
"Epoch 043: | Loss: 0.42890 | Acc: 77.556\n",
"Epoch 044: | Loss: 0.42649 | Acc: 77.806\n",
"Epoch 045: | Loss: 0.42746 | Acc: 77.508\n",
"Epoch 046: | Loss: 0.42936 | Acc: 77.365\n",
"Epoch 047: | Loss: 0.42697 | Acc: 77.654\n",
"Epoch 048: | Loss: 0.42456 | Acc: 77.692\n",
"Epoch 049: | Loss: 0.42648 | Acc: 77.763\n",
"Epoch 050: | Loss: 0.42833 | Acc: 77.573\n"
]
}
],
"source": [
"model.train()\n",
"for e in range(1, EPOCHS+1):\n",
" epoch_loss = 0\n",
" epoch_acc = 0\n",
" for X_batch, y_batch in train_loader:\n",
" X_batch, y_batch = X_batch.to(device), y_batch.to(device)\n",
" optimizer.zero_grad()\n",
" \n",
" y_pred = model(X_batch)\n",
" \n",
" loss = criterion(y_pred, y_batch.unsqueeze(1))\n",
" acc = binary_acc(y_pred, y_batch.unsqueeze(1))\n",
" \n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" epoch_loss += loss.item()\n",
" epoch_acc += acc.item()\n",
" \n",
"\n",
" print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')"
]
},
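{
"cell_type": "markdown",
"id": "4a5b6c7d-8e9f-4a0b-9c1d-2e3f4a5b6c7d",
"metadata": {},
"source": [
"To reuse the trained network without retraining, the weights can be checkpointed. A minimal sketch; the file name `nn_model.pt` is an assumption:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d9e0f1a-2b3c-4d4e-8f5a-6b7c8d9e0f1a",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical checkpoint: persist only the learned weights (file name assumed).\n",
"torch.save(model.state_dict(), 'nn_model.pt')\n",
"# To restore later:\n",
"# model = binaryClassification()\n",
"# model.load_state_dict(torch.load('nn_model.pt'))"
]
},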
{
"cell_type": "code",
"execution_count": 97,
"id": "13260656-39d3-4bc7-91fe-e7f5fe09c6d2",
"metadata": {},
"outputs": [],
"source": [
"y_pred_list = []\n",
"model.eval()\n",
"with torch.no_grad():\n",
" for X_batch in test_loader:\n",
" X_batch = X_batch.to(device)\n",
" y_test_pred = model(X_batch)\n",
" y_test_pred = torch.sigmoid(y_test_pred)\n",
" y_pred_tag = torch.round(y_test_pred)\n",
" y_pred_list.append(y_pred_tag.cpu().numpy())\n",
"\n",
"y_pred_list = [a.squeeze().tolist() for a in y_pred_list]"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "1bd71613-25f7-4342-aa58-2abebce0f5f9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0.0 0.53 0.61 0.57 7533\n",
" 1.0 0.54 0.45 0.49 7394\n",
"\n",
" accuracy 0.53 14927\n",
" macro avg 0.53 0.53 0.53 14927\n",
"weighted avg 0.53 0.53 0.53 14927\n",
"\n"
]
}
],
"source": [
"print(classification_report(y_test, y_pred_list))"
]
},
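{
"cell_type": "markdown",
"id": "6c7d8e9f-0a1b-4c2d-9e3f-4a5b6c7d8e9f",
"metadata": {},
"source": [
"The report shows near-chance performance, so it helps to see where the errors fall. A short sketch using the `confusion_matrix` imported earlier (not part of the original run):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b2c3d4e-5f6a-4b7c-8d9e-0f1a2b3c4d5e",
"metadata": {},
"outputs": [],
"source": [
"# Rows: true classes (0.0, 1.0); columns: predicted classes.\n",
"print(confusion_matrix(y_test, y_pred_list))"
]
},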
{
"cell_type": "code",
"execution_count": null,
"id": "6d2f8d98-c965-494a-b3c7-7b822e3ab6f0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@@ -694,9 +694,9 @@
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "Python [conda env:psda]",
+  "display_name": "Python 3",
   "language": "python",
-  "name": "conda-env-psda-py"
+  "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
@@ -708,7 +708,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.7.3"
+  "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
@@ -725,5 +725,5 @@
  }
 },
 "nbformat": 4,
-"nbformat_minor": 2
+"nbformat_minor": 4
}