Commit e777529e authored by Oliver Wirth

Add training for all regions and ensemble model

parent 47f73752
@@ -47,10 +47,13 @@
 "test_path = data_path / 'test'\n",
 "runs_path = Path('runs')\n",
 "\n",
 "# Load labels\n",
 "train_labels = pd.read_csv(f'{train_path}_label.csv')\n",
+ "test_labels = pd.read_csv(f'{test_path}_label.csv')\n",
+ "\n",
+ "# Merge train and test labels\n",
 "all_labels = train_labels.append(test_labels, ignore_index = True)\n",
 "all_labels = all_labels.dropna()\n",
 "all_labels.ret = all_labels.ret.astype(int)"
 ]
},
@@ -62,11 +65,11 @@
 "source": [
 "# Minimal time series length after cleaning\n",
 "min_series_length = 60\n",
 "\n",
 "# Custom dataset per region\n",
 "class RegionDataset(Dataset) :\n",
 " \n",
 " def __init__(self, region_path, labels, transform = None) :\n",
 " super().__init__()\n",
 " self.region = region_path.name\n",
 " self.transform = transform\n",
@@ -95,11 +98,11 @@
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
 "# Preprocessing transformations\n",
 "# TODO: PCA, truncated SVD, MinMaxScaling, scale over whole dataset, features selection\n",
 "transform = transforms.Compose([\n",
 " StandardScaler().fit_transform,\n",
 " torch.FloatTensor\n",
 "])\n",
...
%% Cell type:code id: tags:
``` python
from pathlib import Path
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_sequence, pad_sequence
from torchvision import transforms
np.random.seed(42)
pd.set_option("display.max_columns", None)
sns.set_theme()
```
%% Cell type:markdown id: tags:
## Dataset and paths
%% Cell type:code id: tags:
``` python
# Data paths
data_path = Path('data')
train_path = data_path / 'train'
test_path = data_path / 'test'
runs_path = Path('runs')
# Load labels
train_labels = pd.read_csv(f'{train_path}_label.csv')
test_labels = pd.read_csv(f'{test_path}_label.csv')
# Merge train and test labels
all_labels = pd.concat([train_labels, test_labels], ignore_index = True)
all_labels = all_labels.dropna()
all_labels.ret = all_labels.ret.astype(int)
```
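%% Cell type:markdown id: tags:
A quick class-balance check on the merged labels (a minimal sketch; `ret` is the binary target column loaded above):
%% Cell type:code id: tags:
``` python
# Class balance of the merged labels
print(all_labels.ret.value_counts(normalize = True))
```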
%% Cell type:code id: tags:
``` python
# Minimal time series length after cleaning
min_series_length = 60
# Custom dataset per region
class RegionDataset(Dataset) :
def __init__(self, region_path, labels, transform = None) :
super().__init__()
self.region = region_path.name
self.transform = transform
self.dfs = []
for csv_path in region_path.iterdir() :
df = pd.read_csv(csv_path)
df = df.dropna()
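            # Keep only rows with at least one non-zero value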
df = df[(df.T != 0).any()]
label = labels[labels.file_name == csv_path.name].ret.values
if len(df) >= min_series_length and len(label) == 1 :
self.dfs.append((df, label[0]))
def __len__(self) :
return len(self.dfs)
def __getitem__(self, idx) :
df, label = self.dfs[idx]
if self.transform :
df = self.transform(df)
return df, label
```
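%% Cell type:markdown id: tags:
A quick smoke test of the dataset class, as a sketch; `006` is just one example region directory (it appears in the blacklist below) and any region folder under `data/train` works:
%% Cell type:code id: tags:
``` python
# Load one region without transforms and inspect a sample (sketch)
example_region = train_path / '006'
dataset = RegionDataset(example_region, all_labels, transform = None)
print(f'{len(dataset)} usable series in region {dataset.region}')
df, label = dataset[0]
print(df.shape, label)
```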
%% Cell type:markdown id: tags:
## Region model
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
conv1_size = 64
conv1_kernel = 7
conv2_size = 64
conv2_kernel = 7
conv3_size = 64
conv3_kernel = 7
conv_stride = 2
pool_size = 16
hidden_size = 32
output_size = 2
dropout = .4
# Separable convolution layer
class SepConv1d(nn.Module) :
def __init__(self, in_channels, out_channels, kernel_size, stride = 1, padding = 0) :
super().__init__()
self.depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups = in_channels)
self.pointwise_conv = nn.Conv1d(in_channels, out_channels, kernel_size = 1)
def forward(self, x) :
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
# CNN with separable 1D convolutions
class BNSepCNN(nn.Module) :
def __init__(self) :
super().__init__()
self.conv = nn.Sequential(
SepConv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),
nn.BatchNorm1d(conv1_size),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),
nn.BatchNorm1d(conv2_size),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),
nn.BatchNorm1d(conv3_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.AdaptiveMaxPool1d(pool_size)
)
self.fc = nn.Sequential(
nn.Linear(conv3_size * pool_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
x = x.transpose(1, 2)
x = self.conv(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
```
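%% Cell type:markdown id: tags:
A shape sanity check with a dummy batch, as a sketch: 4 series of 100 timesteps with `input_size` features should map to one logit pair per series.
%% Cell type:code id: tags:
``` python
# Dummy forward pass to verify the transpose/conv/pool/fc shape flow (sketch)
probe = BNSepCNN().eval()
with torch.no_grad() :
    out = probe(torch.randn(4, 100, input_size))
print(out.shape)  # expected: torch.Size([4, output_size])
```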
%% Cell type:markdown id: tags:
## Train models
%% Cell type:code id: tags:
``` python
# Create run directory
run_path = runs_path / str(datetime.datetime.now())
run_path.mkdir()
# CUDA
use_cuda = False
DEVICE = torch.device('cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu')
```
%% Cell type:code id: tags:
``` python
# Forward pass for a single batch
def forward_pass(model, samples, labels, criterion, optimizer = None) :
out = model(samples)
pred = out.argmax(dim = 1)
loss = criterion(out, labels)
if optimizer :
optimizer.zero_grad()
loss.backward()
optimizer.step()
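    # CrossEntropyLoss returns the batch mean; scale back to a sum so the
    # epoch total can be divided by the dataset size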
loss = loss.item() * labels.size(0)
correct = (pred == labels).sum().item()
return loss, correct
# Forward pass for whole dataset
def train_loop(model, loader, criterion, optimizer = None) :
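    # An optimizer marks a training pass; without one this runs in eval mode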
model.train(optimizer is not None)
running_loss = 0
running_correct = 0
for samples, labels in loader :
samples, labels = samples.to(DEVICE), labels.to(DEVICE)
loss, correct = forward_pass(model, samples, labels, criterion, optimizer)
running_loss += loss
running_correct += correct
return running_loss, running_correct
```
%% Cell type:code id: tags:
``` python
# Preprocessing transformations
# TODO: PCA, truncated SVD, MinMaxScaling, scale over whole dataset, features selection
transform = transforms.Compose([
StandardScaler().fit_transform,
torch.FloatTensor
])
# Crop sequences to same length
def collate_crop(batch) :
samples, labels = zip(*batch)
length = min(x.size(0) for x in samples)
samples = torch.stack([x[-length:] for x in samples])
labels = torch.tensor(labels)
return samples, labels
collate_fn = collate_crop
# Dataset params
holdout = .1
batch_size = 8
# Learning rate and max epochs
lr = 1e-3
epochs = 500
# LR scheduler
scheduler_warmup = 30
break_on_min_lr = True
min_lr = 1e-5
factor = .5
patience = 10
```
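%% Cell type:markdown id: tags:
`pad_sequence` is imported above but unused; as a sketch, a padding-based alternative to `collate_crop` that keeps full sequences by zero-padding each one to the longest in the batch (note that `pad_sequence` appends padding at the end):
%% Cell type:code id: tags:
``` python
# Alternative collate (sketch): pad instead of cropping to the shortest
def collate_pad(batch) :
    samples, labels = zip(*batch)
    samples = pad_sequence(samples, batch_first = True)
    labels = torch.tensor(labels)
    return samples, labels
# collate_fn = collate_pad  # opt in by replacing collate_crop
```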
%% Cell type:code id: tags:
``` python
# Load region dataset, train and save model
def train_routine(region_path) :
print(f'=== Region {region_path.name} ===')
print('Loading dataset...')
trainset = RegionDataset(region_path, all_labels, transform = transform)
# Split into train and validation set
n_val = int(len(trainset) * holdout)
n_train = len(trainset) - n_val
trainset, valset = random_split(trainset, [n_train, n_val])
trainloader = DataLoader(trainset, batch_size = batch_size, collate_fn = collate_fn, shuffle = True)
valloader = DataLoader(valset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)
# Init model and optimizer
model = BNSepCNN()
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = factor, patience = patience, verbose = False)
# Loss/accuracy logs
loss_stats = pd.DataFrame(columns = ['train', 'val'])
acc_stats = pd.DataFrame(columns = ['train', 'val'])
# Create model run directory
region_run_path = run_path / region_path.name
region_run_path.mkdir()
with open(region_run_path / 'model_architecture.txt', 'w') as f:
f.write(f"""{model}
{optimizer}
batch_size: {batch_size}
factor: {factor}
patience: {patience}
scheduler_warmup: {scheduler_warmup}""")
# Train model
print('Training model...')
for epoch in range(epochs) :
# Train loop
loss, correct = train_loop(model, trainloader, criterion, optimizer)
train_loss = loss / len(trainset)
train_acc = correct / len(trainset)
# Validation loop
loss, correct = train_loop(model, valloader, criterion)
val_loss = loss / len(valset)
val_acc = correct / len(valset)
        # Statistics
        loss_stats.loc[len(loss_stats)] = {'train': train_loss, 'val': val_loss}
        acc_stats.loc[len(acc_stats)] = {'train': train_acc, 'val': val_acc}
# Save best model
if loss_stats['val'].idxmin() == len(loss_stats) - 1 :
torch.save(model, region_run_path / 'model_best.pt')
# Schedule learning rate after warmup period
if epoch >= scheduler_warmup :
scheduler.step(val_loss)
current_lr = optimizer.param_groups[0]['lr']
if break_on_min_lr and current_lr < min_lr :
break
torch.save(model, region_run_path / 'model_last.pt')
# Loss/accuracy plot
fig, ax = plt.subplots(1, 2, figsize = (16, 6))
    sns.lineplot(data = loss_stats, ax = ax[0])
    sns.lineplot(data = acc_stats, ax = ax[1])
ax[0].set_title('CE loss')
ax[1].set_title('accuracy')
ax[0].set_xlabel('epoch')
ax[1].set_xlabel('epoch')
fig.savefig(region_run_path / 'loss_acc_plot.png')
print(f'best validation acc = {acc_stats["val"].max()}')
```
%% Cell type:code id: tags:
``` python
# Exclude already trained models (from 2021-06-04 14:23:10.737430)
blacklist = ['006', '018', '020', '049', '052', '057', '060', '064']
# Train models for all regions
for region_path in train_path.iterdir() :
if region_path.is_dir() and region_path.name not in blacklist :
train_routine(region_path)
```
%% Cell type:markdown id: tags:
## Evaluation
%% Cell type:code id: tags:
``` python
# Ensemble model consisting of pre-trained region models
class RegionEnsemble(nn.Module) :

    def __init__(self, run_path) :
        super().__init__()
        # ModuleList registers the sub-models, so .to(DEVICE) moves them and
        # train()/eval() propagate to them automatically
        self.models = nn.ModuleList()
        # Load all previously trained models
        for region_run_path in run_path.iterdir() :
            if region_run_path.is_dir() :
                model = torch.load(region_run_path / 'model_best.pt', map_location = DEVICE)
                self.models.append(model)
def forward(self, x) :
# Get region model outputs
x = torch.stack([model(x) for model in self.models], dim = 1)
# Convert to probabilities
x = x.softmax(dim = 2)
# Average probabilities (confidence-weighted majority vote)
x = x.mean(dim = 1)
return x
```
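%% Cell type:markdown id: tags:
A toy illustration of the confidence-weighted vote with made-up logits for two region models and a single sample: the averaged probabilities favour the more confident model.
%% Cell type:code id: tags:
``` python
# Shape (batch = 1, models = 2, classes = 2)
logits = torch.tensor([[[2.0, 0.0],    # model A: confident in class 0
                        [0.5, 1.0]]])  # model B: mildly favours class 1
probs = logits.softmax(dim = 2).mean(dim = 1)
print(probs, probs.argmax(dim = 1))  # ~[0.63, 0.37] -> class 0
```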
%% Cell type:code id: tags:
``` python
# Load ensemble
ensemble = RegionEnsemble(runs_path / 'best')
ensemble = ensemble.to(DEVICE)
```
%% Cell type:code id: tags:
``` python
# Calculate binary classification statistics for a test region
def eval_routine(model, region_path) :
testset = RegionDataset(region_path, all_labels, transform = transform)
testloader = DataLoader(testset, batch_size = batch_size, collate_fn = collate_fn, shuffle = False)
model.train(False)
# Pass through model
tp = tn = fp = fn = 0
for samples, labels in testloader :
samples, labels = samples.to(DEVICE), labels.to(DEVICE)
out = model(samples)
pred = out.argmax(dim = 1)
# Calculate true/false positives/negatives
tp += ((labels == 1) & (pred == 1)).sum().item()
tn += ((labels == 0) & (pred == 0)).sum().item()
fp += ((labels == 0) & (pred == 1)).sum().item()
fn += ((labels == 1) & (pred == 0)).sum().item()
return tp, tn, fp, fn
```
%% Cell type:code id: tags:
``` python
stats = pd.DataFrame(columns = ['tp', 'tn', 'fp', 'fn'])
# Evaluate model on all test regions
for region_path in test_path.iterdir() :
if region_path.is_dir() :
stats.loc[region_path.name] = eval_routine(ensemble, region_path)
```
%% Cell type:code id: tags:
``` python
# Additional metrics
def stats_metrics(stats) :
stats = stats.copy()
stats['total'] = stats.sum(axis = 1)
stats['accuracy'] = (stats['tp'] + stats['tn']) / stats['total']
stats['precision'] = stats['tp'] / (stats['tp'] + stats['fp'])
stats['recall'] = stats['tp'] / (stats['tp'] + stats['fn'])
stats['f1 score'] = 2 * stats['precision'] * stats['recall'] / (stats['precision'] + stats['recall'])
return stats
```
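%% Cell type:markdown id: tags:
A tiny worked example with made-up counts to sanity-check the formulas:
%% Cell type:code id: tags:
``` python
# Made-up counts: 30 TP, 50 TN, 10 FP, 10 FN out of 100 samples
example = pd.DataFrame([[30, 50, 10, 10]], columns = ['tp', 'tn', 'fp', 'fn'])
# accuracy = 80/100 = 0.8, precision = recall = 30/40 = 0.75, f1 = 0.75
stats_metrics(example)
```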
%% Cell type:code id: tags:
``` python
# Per-region metrics
metrics = stats_metrics(stats)
metrics.to_csv(run_path / 'evaluation_per_region.csv')
metrics
```
%% Cell type:code id: tags:
``` python
# Accumulated metrics
stats_acc = stats.sum(axis = 0).to_frame().transpose()
metrics = stats_metrics(stats_acc)
metrics.to_csv(run_path / 'evaluation.csv')
# Plot confusion matrix
sns.heatmap(
data = pd.DataFrame(
metrics[['tn', 'fp', 'fn', 'tp']].to_numpy().reshape((2, 2))
).rename_axis('ground-truth', axis = 0).rename_axis('prediction', axis = 1),
annot = True,
cbar = False,
cmap = 'mako_r'
)
plt.savefig(run_path / 'confusion_matrix.png')
metrics
```