Commit e777529e authored by Oliver Wirth

Add training for all regions and ensemble model

parent 47f73752
@@ -63,6 +63,9 @@
"metadata": {},
"outputs": [],
"source": [
"# Minimal time series length after cleaning\n",
"min_series_length = 60\n",
"\n",
"# Custom dataset per region\n",
"class RegionDataset(Dataset) :\n",
" \n",
@@ -78,7 +81,7 @@
" df = df[(df.T != 0).any()]\n",
" \n",
" label = labels[labels.file_name == csv_path.name].ret.values\n",
" if df.shape[0] > 0 and len(label) == 1 :\n",
" if len(df) >= min_series_length and len(label) == 1 :\n",
" self.dfs.append((df, label[0]))\n",
" \n",
" def __len__(self) :\n",
@@ -117,7 +120,7 @@
"source": [
"# Split into train and validation set\n",
"holdout = .2\n",
"n_val = int(len(trainset) * .2)\n",
"n_val = int(len(trainset) * holdout)\n",
"n_train = len(trainset) - n_val\n",
"trainset, valset = random_split(trainset, [n_train, n_val])\n",
"\n",
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import datetime\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from tqdm import tqdm\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"import torch\n",
"from torch import nn, optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from torch.utils.data.dataset import random_split\n",
"from torch.nn.utils.rnn import pack_sequence, pad_sequence\n",
"from torchvision import transforms\n",
"\n",
"np.random.seed(42)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"sns.set_theme()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset and paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data paths\n",
"data_path = Path('data')\n",
"train_path = data_path / 'train'\n",
"test_path = data_path / 'test'\n",
"runs_path = Path('runs')\n",
"\n",
"# Load labels\n",
"train_labels = pd.read_csv(f'{train_path}_label.csv')\n",
"test_labels = pd.read_csv(f'{test_path}_label.csv')\n",
"\n",
"# Merge train and test labels\n",
"all_labels = train_labels.append(test_labels, ignore_index = True)\n",
"all_labels = all_labels.dropna()\n",
"all_labels.ret = all_labels.ret.astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal time series length after cleaning\n",
"min_series_length = 60\n",
"\n",
"# Custom dataset per region\n",
"class RegionDataset(Dataset) :\n",
" \n",
" def __init__(self, region_path, labels, transform = None) :\n",
" super().__init__()\n",
" self.region = region_path.name\n",
" self.transform = transform\n",
" self.dfs = []\n",
" \n",
" for csv_path in region_path.iterdir() :\n",
" df = pd.read_csv(csv_path)\n",
" df = df.dropna()\n",
" df = df[(df.T != 0).any()]\n",
" \n",
" label = labels[labels.file_name == csv_path.name].ret.values\n",
" if len(df) >= min_series_length and len(label) == 1 :\n",
" self.dfs.append((df, label[0]))\n",
" \n",
" def __len__(self) :\n",
" return len(self.dfs)\n",
" \n",
" def __getitem__(self, idx) :\n",
" df, label = self.dfs[idx]\n",
" if self.transform :\n",
" df = self.transform(df)\n",
" return df, label"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Region model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NN hyperparameters\n",
"input_size = 75\n",
"conv1_size = 64\n",
"conv1_kernel = 7\n",
"conv2_size = 64\n",
"conv2_kernel = 7\n",
"conv3_size = 64\n",
"conv3_kernel = 7\n",
"conv_stride = 2\n",
"pool_size = 16\n",
"hidden_size = 32\n",
"output_size = 2\n",
"dropout = .4\n",
"\n",
"# Separable convolution layer\n",
"class SepConv1d(nn.Module) :\n",
" \n",
" def __init__(self, in_channels, out_channels, kernel_size, stride = 1, padding = 0) :\n",
" super().__init__()\n",
" self.depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups = in_channels)\n",
" self.pointwise_conv = nn.Conv1d(in_channels, out_channels, kernel_size = 1)\n",
" \n",
" def forward(self, x) :\n",
" x = self.depthwise_conv(x)\n",
" x = self.pointwise_conv(x)\n",
" return x\n",
"\n",
"# CNN with separable 1D convolutions\n",
"class BNSepCNN(nn.Module) :\n",
" \n",
" def __init__(self) :\n",
" super().__init__()\n",
" self.conv = nn.Sequential(\n",
" SepConv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),\n",
" nn.BatchNorm1d(conv1_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" SepConv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),\n",
" nn.BatchNorm1d(conv2_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" SepConv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),\n",
" nn.BatchNorm1d(conv3_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" nn.AdaptiveMaxPool1d(pool_size)\n",
" )\n",
" self.fc = nn.Sequential(\n",
" nn.Linear(conv3_size * pool_size, hidden_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" nn.Linear(hidden_size, output_size)\n",
" )\n",
" \n",
" def forward(self, x) :\n",
" x = x.transpose(1, 2)\n",
" x = self.conv(x)\n",
" x = x.view(x.size(0), -1)\n",
" x = self.fc(x)\n",
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create run directory\n",
"run_path = runs_path / str(datetime.datetime.now())\n",
"run_path.mkdir()\n",
"\n",
"# CUDA\n",
"use_cuda = False\n",
"DEVICE = torch.device('cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Forward pass for a single batch\n",
"def forward_pass(model, samples, labels, criterion, optimizer = None) :\n",
" out = model(samples)\n",
" pred = out.argmax(dim = 1)\n",
" loss = criterion(out, labels)\n",
"\n",
" if optimizer :\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss = loss.item() * labels.size(0)\n",
" correct = (pred == labels).sum().item()\n",
" return loss, correct\n",
"\n",
"# Forward pass for whole dataset\n",
"def train_loop(model, loader, criterion, optimizer = None) :\n",
" model.train(optimizer is not None)\n",
" running_loss = 0\n",
" running_correct = 0\n",
" \n",
" for samples, labels in loader :\n",
" samples, labels = samples.to(DEVICE), labels.to(DEVICE)\n",
" loss, correct = forward_pass(model, samples, labels, criterion, optimizer)\n",
" running_loss += loss\n",
" running_correct += correct\n",
" \n",
" return running_loss, running_correct"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preprocessing transformations\n",
"# TODO: PCA, truncated SVD, MinMaxScaling, scale over whole dataset, features selection\n",
"transform = transforms.Compose([\n",
" StandardScaler().fit_transform,\n",
" torch.FloatTensor\n",
"])\n",
"\n",
"# Crop sequences to same length\n",
"def collate_crop(batch) :\n",
" samples, labels = zip(*batch)\n",
" length = min(x.size(0) for x in samples)\n",
" samples = torch.stack([x[-length:] for x in samples])\n",
" labels = torch.tensor(labels)\n",
" return samples, labels\n",
"collate_fn = collate_crop\n",
"\n",
"# Dataset params\n",
"holdout = .1\n",
"batch_size = 8\n",
"\n",
"# Learning rate and max epochs\n",
"lr = 1e-3\n",
"epochs = 500\n",
"\n",
"# LR scheduler\n",
"scheduler_warmup = 30\n",
"break_on_min_lr = True\n",
"min_lr = 1e-5\n",
"factor = .5\n",
"patience = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load region dataset, train and save model\n",
"def train_routine(region_path) :\n",
" print(f'=== Region {region_path.name} ===')\n",
" print('Loading dataset...')\n",
" trainset = RegionDataset(region_path, all_labels, transform = transform)\n",
" \n",
" # Split into train and validation set\n",
" n_val = int(len(trainset) * holdout)\n",
" n_train = len(trainset) - n_val\n",
" trainset, valset = random_split(trainset, [n_train, n_val])\n",
" trainloader = DataLoader(trainset, batch_size = batch_size, collate_fn = collate_fn, shuffle = True)\n",
" valloader = DataLoader(valset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)\n",
" \n",
" # Init model and optimizer\n",
" model = BNSepCNN()\n",
" model = model.to(DEVICE)\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = optim.Adam(model.parameters(), lr = lr)\n",
" scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = factor, patience = patience, verbose = False)\n",
" \n",
" # Loss/accuracy logs\n",
" loss_stats = pd.DataFrame(columns = ['train', 'val'])\n",
" acc_stats = pd.DataFrame(columns = ['train', 'val'])\n",
" \n",
" # Create model run directory\n",
" region_run_path = run_path / region_path.name\n",
" region_run_path.mkdir()\n",
" with open(region_run_path / 'model_architecture.txt', 'w') as f:\n",
" f.write(f\"\"\"{model}\n",
"\n",
" {optimizer}\n",
" batch_size: {batch_size}\n",
" factor: {factor}\n",
" patience: {patience}\n",
" scheduler_warmup: {scheduler_warmup}\"\"\")\n",
" \n",
" # Train model\n",
" print('Training model...')\n",
" for epoch in range(epochs) :\n",
" # Train loop\n",
" loss, correct = train_loop(model, trainloader, criterion, optimizer)\n",
" train_loss = loss / len(trainset)\n",
" train_acc = correct / len(trainset)\n",
" \n",
" # Validation loop\n",
" loss, correct = train_loop(model, valloader, criterion)\n",
" val_loss = loss / len(valset)\n",
" val_acc = correct / len(valset)\n",
"\n",
" # Statistics\n",
" loss_stats = loss_stats.append({\n",
" 'train': train_loss,\n",
" 'val': val_loss\n",
" }, ignore_index = True)\n",
" acc_stats = acc_stats.append({\n",
" 'train': train_acc,\n",
" 'val': val_acc\n",
" }, ignore_index = True)\n",
"\n",
" # Save best model\n",
" if loss_stats['val'].idxmin() == len(loss_stats) - 1 :\n",
" torch.save(model, region_run_path / 'model_best.pt')\n",
"\n",
" # Schedule learning rate after warmup period\n",
" if epoch >= scheduler_warmup :\n",
" scheduler.step(val_loss)\n",
" current_lr = optimizer.param_groups[0]['lr']\n",
" if break_on_min_lr and current_lr < min_lr :\n",
" break\n",
" torch.save(model, region_run_path / 'model_last.pt')\n",
" \n",
" # Loss/accuracy plot\n",
" fig, ax = plt.subplots(1, 2, figsize = (16, 6))\n",
" g = sns.lineplot(data = loss_stats, ax = ax[0])\n",
" g = sns.lineplot(data = acc_stats, ax = ax[1])\n",
" ax[0].set_title('CE loss')\n",
" ax[1].set_title('accuracy')\n",
" ax[0].set_xlabel('epoch')\n",
" ax[1].set_xlabel('epoch')\n",
" fig.savefig(region_run_path / 'loss_acc_plot.png')\n",
" print(f'best validation acc = {acc_stats[\"val\"].max()}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Exclude already trained models (from 2021-06-04 14:23:10.737430)\n",
"blacklist = ['006', '018', '020', '049', '052', '057', '060', '064']\n",
"\n",
"# Train models for all regions\n",
"for region_path in train_path.iterdir() :\n",
" if region_path.is_dir() and region_path.name not in blacklist :\n",
" train_routine(region_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ensemble model consisting of pre-trained region models\n",
"class RegionEnsemble(nn.Module) :\n",
" \n",
" def __init__(self, run_path) :\n",
" self.models = []\n",
" # Load all previously trained models\n",
" for region_run_path in run_path.iterdir() :\n",
" if region_run_path.is_dir() :\n",
" model = torch.load(region_run_path / 'model_best.pt')\n",
" self.models.append(model)\n",
" \n",
" def train(self, mode = True) :\n",
" for model in self.models :\n",
" model.train(mode)\n",
" return super().train(mode)\n",
" \n",
" def forward(self, x) :\n",
" # Get region model outputs\n",
" x = torch.stack([model(x) for model in self.models], dim = 1)\n",
" # Convert to probabilities\n",
" x = x.softmax(dim = 2)\n",
" # Average probabilities (confidence-weighted majority vote)\n",
" x = x.mean(dim = 1)\n",
" return x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load ensemble\n",
"ensemble = RegionEnsemble(runs_path / 'best')\n",
"ensemble = ensemble.to(DEVICE)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Calculate binary classification statistics for a test region\n",
"def eval_routine(model, region_path) :\n",
" testset = RegionDataset(region_path, all_labels, transform = transform)\n",
" testloader = DataLoader(testset, batch_size = batch_size, collate_fn = collate_fn, shuffle = False)\n",
" model.train(False)\n",
" \n",
" # Pass through model\n",
" tp = tn = fp = fn = 0\n",
" for samples, labels in testloader :\n",
" samples, labels = samples.to(DEVICE), labels.to(DEVICE)\n",
" out = model(samples)\n",
" pred = out.argmax(dim = 1)\n",
" \n",
" # Calculate true/false positives/negatives\n",
" tp += ((labels == 1) & (pred == 1)).sum().item()\n",
" tn += ((labels == 0) & (pred == 0)).sum().item()\n",
" fp += ((labels == 0) & (pred == 1)).sum().item()\n",
" fn += ((labels == 1) & (pred == 0)).sum().item()\n",
" return tp, tn, fp, fn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"stats = pd.DataFrame(columns = ['tp', 'tn', 'fp', 'fn'])\n",
"\n",
"# Evaluate model on all test regions\n",
"for region_path in test_path.iterdir() :\n",
" if region_path.is_dir() :\n",
" stats.loc[region_path.name] = eval_routine(ensemble, region_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Additional metrics\n",
"def stats_metrics(stats) :\n",
" stats = stats.copy()\n",
" stats['total'] = stats.sum(axis = 1)\n",
" stats['accuracy'] = (stats['tp'] + stats['tn']) / stats['total']\n",
" stats['precision'] = stats['tp'] / (stats['tp'] + stats['fp'])\n",
" stats['recall'] = stats['tp'] / (stats['tp'] + stats['fn'])\n",
" stats['f1 score'] = 2 * stats['precision'] * stats['recall'] / (stats['precision'] + stats['recall'])\n",
" return stats"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Per-region metrics\n",
"metrics = stats_metrics(stats)\n",
"metrics.to_csv(run_path / 'evaluation_per_region.csv')\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Accumulated metrics\n",
"stats_acc = stats.sum(axis = 0).to_frame().transpose()\n",
"metrics = stats_metrics(stats_acc)\n",
"metrics.to_csv(run_path / 'evaluation.csv')\n",
"\n",
"# Plot confusion matrix\n",
"sns.heatmap(\n",
" data = pd.DataFrame(\n",
" metrics[['tn', 'fp', 'fn', 'tp']].to_numpy().reshape((2, 2))\n",
" ).rename_axis('ground-truth', axis = 0).rename_axis('prediction', axis = 1),\n",
" annot = True,\n",
" cbar = False,\n",
" cmap = 'mako_r'\n",
")\n",
"plt.savefig('confusion_matrix.png')\n",
"\n",
"metrics"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:psda]",
"language": "python",
"name": "conda-env-psda-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}