Commit 522d9f0b authored by Oliver Wirth

Add LSTM training routine

parent d7008417
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from tqdm import tqdm\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"import torch\n",
"from torch import nn, optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from torchvision import transforms\n",
"\n",
"np.random.seed(42)\n",
"pd.set_option(\"display.max_columns\", None)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"# Data paths\n",
"data_path = Path('data')\n",
"train_path = data_path / 'train'\n",
"test_path = data_path / 'test'\n",
"model_path = Path('checkpoints')\n",
"\n",
"# Load labels\n",
"train_labels = pd.read_csv(f'{train_path}_label.csv')\n",
"test_labels = pd.read_csv(f'{test_path}_label.csv')\n",
"\n",
"# Merge train and test labels\n",
"all_labels = train_labels.append(test_labels, ignore_index = True)\n",
"all_labels = all_labels.dropna()\n",
"all_labels.ret = all_labels.ret.astype(int)"
]
},
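{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (a sketch, not part of the pipeline), the class balance of the merged labels gives context for the accuracies reported below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: inspect the class balance of the merged labels\n",
"print(all_labels.ret.value_counts(normalize = True))"
]
},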
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Custom dataset per region\n",
"class RegionDataset(Dataset) :\n",
" \n",
" def __init__(self, region_path, labels, transform = None) :\n",
" super().__init__()\n",
" self.region = region_path.name\n",
" self.transform = transform\n",
" self.dfs = []\n",
" \n",
" for csv_path in region_path.iterdir() :\n",
" df = pd.read_csv(csv_path)\n",
" df = df.dropna()\n",
" df = df[(df.T != 0).any()]\n",
" \n",
" label = labels[labels.file_name == csv_path.name].ret.values\n",
" if df.shape[0] > 0 and len(label) == 1 :\n",
" self.dfs.append((df, label[0]))\n",
" \n",
" def __len__(self) :\n",
" return len(self.dfs)\n",
" \n",
" def __getitem__(self, idx) :\n",
" df, label = self.dfs[idx]\n",
" if self.transform :\n",
" df = self.transform(df)\n",
" return df, label"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Preprocessing transformations\n",
"# Ideas: PCA, truncated SVD, MinMaxScaling\n",
"transform = transforms.Compose([\n",
" StandardScaler().fit_transform,\n",
" torch.FloatTensor\n",
"])\n",
"\n",
"# Load train and test set\n",
"trainset = RegionDataset(train_path / '004', all_labels, transform = transform)\n",
"testset = RegionDataset(test_path / 'dummy', all_labels, transform = transform)"
]
},
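{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of the alternatives listed above: any scaler exposing `fit_transform` slots into the same `Compose` pipeline, e.g. `MinMaxScaler`. A `PCA` or truncated SVD step would work the same way but reduces the feature dimension, so `input_size` would have to change accordingly. This is illustrative and not used below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: alternative per-sample preprocessing (not used below)\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"alt_transform = transforms.Compose([\n",
"    MinMaxScaler().fit_transform,  # Scale each feature to [0, 1] within a sequence\n",
"    torch.FloatTensor\n",
"])"
]
},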
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# Split into train and validation set\n",
"holdout = .2\n",
"n_val = int(len(trainset) * .2)\n",
"n_train = len(trainset) - n_val\n",
"trainset, valset = torch.utils.data.dataset.random_split(trainset, [n_train, n_val])\n",
"\n",
"# Create data loader\n",
"batch_size = 1\n",
"trainloader = DataLoader(trainset, batch_size = batch_size, shuffle = True)\n",
"testloader = DataLoader(testset , batch_size = batch_size, shuffle = False)\n",
"valloader = DataLoader(valset , batch_size = batch_size, shuffle = False)"
]
},
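{
"cell_type": "markdown",
"metadata": {},
"source": [
"`batch_size = 1` sidesteps collation of variable-length sequences. If larger batches were wanted, a padding `collate_fn` along the following lines could be passed to the loaders (a sketch; note that `forward` below reads the last timestep, so padded batches would additionally need length tracking, e.g. via `pack_padded_sequence`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: padding collate_fn that would allow batch_size > 1\n",
"from torch.nn.utils.rnn import pad_sequence\n",
"\n",
"def pad_collate(batch) :\n",
"    seqs, labels = zip(*batch)\n",
"    seqs = pad_sequence(seqs, batch_first = True)  # Pad to the longest sequence in the batch\n",
"    return seqs, torch.tensor(labels)\n",
"\n",
"# Usage: DataLoader(trainset, batch_size = 32, shuffle = True, collate_fn = pad_collate)"
]
},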
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# NN hyperparameters\n",
"input_size = 75\n",
"lstm_size = 64\n",
"hidden_size = 32\n",
"output_size = 2\n",
"dropout = .5\n",
"\n",
"# NN definition\n",
"class RNN(nn.Module) :\n",
" \n",
" def __init__(self) :\n",
" super().__init__()\n",
" self.rec = nn.LSTM(input_size = input_size, hidden_size = lstm_size, batch_first = True)\n",
" self.clf = nn.Sequential(\n",
" nn.Dropout(dropout),\n",
" nn.Linear(lstm_size, hidden_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" nn.Linear(hidden_size, output_size)\n",
" )\n",
" \n",
" def forward(self, x) :\n",
" x, _ = self.rec(x)\n",
" x = x[:,-1,:]\n",
" x = self.clf(x)\n",
" return x"
]
},
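{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick shape check makes the data flow explicit (a sketch with random input): the LSTM consumes `(batch, seq_len, input_size)` because of `batch_first = True`, only the output at the last timestep is kept, and the classifier maps it to `output_size` logits."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: sanity-check tensor shapes with a dummy batch\n",
"dummy = torch.randn(4, 10, input_size)  # (batch, seq_len, features)\n",
"print(RNN()(dummy).shape)  # Expected: torch.Size([4, 2])"
]
},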
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RNN(\n",
" (rec): LSTM(75, 64, batch_first=True)\n",
" (clf): Sequential(\n",
" (0): Linear(in_features=64, out_features=32, bias=True)\n",
" (1): ReLU()\n",
" (2): Dropout(p=0.5, inplace=False)\n",
" (3): Linear(in_features=32, out_features=2, bias=True)\n",
" )\n",
")"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Learning rate and starting epoch\n",
"lr = 2e-4\n",
"epoch = 0\n",
"\n",
"# Init model and optimizer\n",
"model = RNN()\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.Adam(model.parameters(), lr = lr)\n",
"\n",
"# CUDA\n",
"DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
"model.to(DEVICE)"
]
},
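{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the number of trainable parameters of the freshly initialized model (a sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: count trainable parameters\n",
"n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
"print(f'{n_params:,} trainable parameters')"
]
},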
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Forward pass for a single batch\n",
"def forward_pass(model, samples, labels, criterion, optimizer = None) :\n",
" out = model(samples)\n",
" pred = out.argmax(dim = 1)\n",
" loss = criterion(out, labels)\n",
"\n",
" if optimizer :\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss = loss.item() * samples.size(0)\n",
" correct = (pred == labels).sum().item()\n",
" return loss, correct\n",
"\n",
"# Forward pass for whole dataset\n",
"def train_loop(model, loader, criterion, optimizer = None) :\n",
" model.train(optimizer is not None)\n",
" running_loss = 0\n",
" running_correct = 0\n",
" \n",
" for samples, labels in loader :\n",
" samples, labels = samples.to(DEVICE), labels.to(DEVICE)\n",
" loss, correct = forward_pass(model, samples, labels, criterion, optimizer)\n",
" running_loss += loss\n",
" running_correct += correct\n",
" \n",
" return running_loss, running_correct"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"epochs = 20\n",
"loss_stats = pd.DataFrame(columns = ['train', 'val'])\n",
"acc_stats = pd.DataFrame(columns = ['train', 'val'])\n",
"\n",
"# Train model\n",
"for epoch in range(epoch, epoch + epochs) :\n",
" print(f'=== Epoch {(epoch + 1):2} ===')\n",
" \n",
" # Train loop\n",
" loss, correct = train_loop(model, trainloader, criterion, optimizer)\n",
" train_loss = loss / len(trainset)\n",
" train_acc = correct / len(trainset)\n",
" print(f' training loss = {train_loss:.4f}, acc = {train_acc:.4f}')\n",
" \n",
" # Validation loop\n",
" loss, correct = train_loop(model, valloader, criterion)\n",
" val_loss = loss / len(valset)\n",
" val_acc = correct / len(valset)\n",
" print(f'validation loss = {val_loss:.4f}, acc = {val_acc:.4f}')\n",
" \n",
" # Statistics\n",
" loss_stats = loss_stats.append({\n",
" 'train': train_loss,\n",
" 'val': val_loss\n",
" }, ignore_index = True)\n",
" acc_stats = acc_stats.append({\n",
" 'train': train_acc,\n",
" 'val': val_acc\n",
" }, ignore_index = True)\n",
" \n",
" # Save best model\n",
" if loss_stats['val'].idxmin() == len(loss_stats) - 1 :\n",
" torch.save(model, model_path / f'model_{model.__class__.__name__}_e{epoch + 1}.pt')\n",
" \n",
"epoch += 1"
]
},
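{
"cell_type": "markdown",
"metadata": {},
"source": [
"The collected statistics can be visualized with the already-imported matplotlib (a sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: plot learning curves from the collected statistics\n",
"fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (12, 4))\n",
"loss_stats.plot(ax = ax1, title = 'Loss')\n",
"acc_stats.plot(ax = ax2, title = 'Accuracy')\n",
"ax1.set_xlabel('epoch')\n",
"ax2.set_xlabel('epoch')\n",
"plt.show()"
]
},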
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation after 20 epochs: loss = 0.7293, acc = 0.4805\n"
]
}
],
"source": [
"# Evaluate trained model on test set\n",
"running_loss, running_correct = train_loop(model, testloader, criterion)\n",
"n = len(testset)\n",
"print(f'Evaluation after {epoch} epochs: loss = {(running_loss / n):.4f}, acc = {(running_correct / n):.4f}')"
]
}
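,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that this evaluates the most recent model state. To score the best checkpoint saved above instead, it could be reloaded first (a sketch, assuming training started from epoch 0 so the row indices of `loss_stats` line up with the saved epoch numbers):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: reload the checkpoint with the lowest validation loss\n",
"best_epoch = loss_stats['val'].idxmin() + 1\n",
"model = torch.load(model_path / f'model_RNN_e{best_epoch}.pt')\n",
"model.to(DEVICE)"
]
}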
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:psda]",
"language": "python",
"name": "conda-env-psda-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}