Commit 13afb2ac authored by uwwdp's avatar uwwdp

Cleanup and additional documentation

parent 9e90b432
%% Cell type:code id: tags:
``` python
from pathlib import Path
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_sequence, pad_sequence
from torchvision import transforms
np.random.seed(42)
pd.set_option("display.max_columns", None)
sns.set_theme()
```
%% Cell type:markdown id: tags:
## Load data
%% Cell type:code id: tags:
``` python
# Data paths
data_path = Path('data')
train_path = data_path / 'train'
test_path = data_path / 'test'
runs_path = Path('runs')
# Load labels
train_labels = pd.read_csv(f'{train_path}_label.csv')
test_labels = pd.read_csv(f'{test_path}_label.csv')
# Merge train and test labels
all_labels = pd.concat([train_labels, test_labels], ignore_index = True)
all_labels = all_labels.dropna()
all_labels.ret = all_labels.ret.astype(int)
```
%% Cell type:code id: tags:
``` python
# Minimal time series length after cleaning
min_series_length = 60
# Custom dataset per region
class RegionDataset(Dataset) :
def __init__(self, region_path, labels, transform = None) :
super().__init__()
self.region = region_path.name
self.transform = transform
self.dfs = []
for csv_path in region_path.iterdir() :
df = pd.read_csv(csv_path)
df = df.dropna()
df = df[(df.T != 0).any()]  # drop rows that are all zeros
label = labels[labels.file_name == csv_path.name].ret.values
if len(df) >= min_series_length and len(label) == 1 :
self.dfs.append((df, label[0]))
def __len__(self) :
return len(self.dfs)
def __getitem__(self, idx) :
df, label = self.dfs[idx]
if self.transform :
df = self.transform(df)
return df, label
```
%% Cell type:markdown id: tags:
Pre-processing transformations.
%% Cell type:code id: tags:
``` python
# Preprocessing transformations
# TODO: PCA, truncated SVD, MinMaxScaling, scale over whole dataset, features selection
transform = transforms.Compose([
StandardScaler().fit_transform,
torch.FloatTensor
])
# Load train and test set
trainset = RegionDataset(train_path / '004', all_labels, transform = transform)
testset = RegionDataset(test_path / 'dummy', all_labels, transform = transform)
```
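%% Cell type:markdown id: tags:
A quick look at a single sample (a minimal sketch; the exact shape depends on the series length of the first file in the region):
%% Cell type:code id: tags:
``` python
# Each sample is a (seq_len, n_features) float tensor with an integer label
sample, label = trainset[0]
print(sample.shape, sample.dtype, label)
```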
%% Cell type:markdown id: tags:
Data split and batching hyperparameters.
Three different collate functions are provided:
- `collate_pack` packs time series into a PyTorch `PackedSequence` object, which allows masking in recurrent networks
- `collate_crop` crops time series to the length of the shortest series in the batch; data points are removed from the front, as later data points likely contribute more information toward the classification goal
- `collate_pad` pads sequences to the length of the longest sequence in the batch
Experiments showed that `collate_crop` works best for CNNs. A short demonstration on toy sequences follows the definitions below.
%% Cell type:code id: tags:
``` python
# Split into train and validation set
holdout = .2
n_val = int(len(trainset) * holdout)
n_train = len(trainset) - n_val
trainset, valset = random_split(trainset, [n_train, n_val])
# Pack variable sized sequences for RNN
def collate_pack(batch) :
samples, labels = zip(*batch)
samples = pack_sequence(samples, enforce_sorted = False)
labels = torch.tensor(labels)
return samples, labels
# Crop sequences to same length
def collate_crop(batch) :
samples, labels = zip(*batch)
length = min(x.size(0) for x in samples)
samples = torch.stack([x[-length:] for x in samples])
labels = torch.tensor(labels)
return samples, labels
# Pad sequences to same length
# TODO masking?
def collate_pad(batch) :
samples, labels = zip(*batch)
samples = pad_sequence([x.flip((0,)) for x in samples], batch_first = True).flip((1,))
labels = torch.tensor(labels)
return samples, labels
# Create data loader
batch_size = 8
collate_fn = collate_crop
trainloader = DataLoader(trainset, batch_size = batch_size, collate_fn = collate_fn, shuffle = True)
testloader = DataLoader(testset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)
valloader = DataLoader(valset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)
```
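%% Cell type:markdown id: tags:
A minimal sketch (toy sequences with made-up shapes, not real data) comparing the three collate functions:
%% Cell type:code id: tags:
``` python
# Two toy series of different lengths, 3 features each (hypothetical shapes)
toy_batch = [
    (torch.randn(5, 3), 0),
    (torch.randn(8, 3), 1)
]
packed, labels = collate_pack(toy_batch)
print(type(packed).__name__)   # PackedSequence
cropped, labels = collate_crop(toy_batch)
print(cropped.shape)           # torch.Size([2, 5, 3]), cropped to the shortest series
padded, labels = collate_pad(toy_batch)
print(padded.shape)            # torch.Size([2, 8, 3]), front-padded to the longest series
```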
%% Cell type:markdown id: tags:
## Models
Various RNN and CNN models, starting with an LSTM-based RNN.
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
lstm_size = 32
hidden_size = 16
output_size = 2
dropout = .5
# LSTM-based NN
class LSTMRNN(nn.Module) :
def __init__(self) :
super().__init__()
self.rec = nn.LSTM(input_size = input_size, hidden_size = lstm_size, batch_first = True)
self.clf = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(lstm_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
_, (x, _) = self.rec(x)
x = x.squeeze(dim = 0)
x = self.clf(x)
return x
```
%% Cell type:markdown id: tags:
Similar to the network above, but with a GRU instead of an LSTM.
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
rec_size = 32
hidden_size = 16
output_size = 2
dropout = .5
# GRU-based NN
class GRURNN(nn.Module) :
def __init__(self) :
super().__init__()
self.rec = nn.GRU(input_size = input_size, hidden_size = rec_size, batch_first = True)
self.clf = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(rec_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
_, x = self.rec(x)
x = x.squeeze(dim = 0)
x = self.clf(x)
return x
```
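%% Cell type:markdown id: tags:
Quick sanity check (a minimal sketch on random input, assuming the hyperparameters above): both recurrent models should map a `(batch, seq_len, input_size)` batch to `(batch, output_size)` logits.
%% Cell type:code id: tags:
``` python
# Random batch: 4 sequences of length 100 with input_size features
x = torch.randn(4, 100, input_size)
for Net in (LSTMRNN, GRURNN):
    net = Net()
    net.eval()
    with torch.no_grad():
        out = net(x)
    print(Net.__name__, tuple(out.shape))   # expected: (4, 2)
```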
%% Cell type:markdown id: tags:
One-dimensional CNN with three convolutional layers.
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
conv1_size = 64
conv1_kernel = 7
conv2_size = 64
conv2_kernel = 7
conv3_size = 64
conv3_kernel = 7
conv_stride = 2
pool_size = 16
hidden_size = 32
output_size = 2
dropout = .5
# 1D convolutional NN
class CNN(nn.Module) :
def __init__(self) :
super().__init__()
self.conv = nn.Sequential(
nn.Conv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),
nn.ReLU(),
nn.Dropout(dropout),
nn.Conv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),
nn.ReLU(),
nn.Dropout(dropout),
nn.Conv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),
nn.ReLU(),
nn.Dropout(dropout),
nn.AdaptiveMaxPool1d(pool_size)
)
self.fc = nn.Sequential(
nn.Linear(conv3_size * pool_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
x = x.transpose(1, 2)
x = self.conv(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
```
%% Cell type:markdown id: tags:
Another CNN with depth-wise separable convolutions, which reduce the number of parameters while maintaining classification performance.
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
conv1_size = 64
conv1_kernel = 7
conv2_size = 64
conv2_kernel = 7
conv3_size = 64
conv3_kernel = 7
conv_stride = 2
pool_size = 16
hidden_size = 32
output_size = 2
dropout = .5
# Separable convolution layer
class SepConv1d(nn.Module) :
def __init__(self, in_channels, out_channels, kernel_size, stride = 1, padding = 0) :
super().__init__()
self.depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups = in_channels)
self.pointwise_conv = nn.Conv1d(in_channels, out_channels, kernel_size = 1)
def forward(self, x) :
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
# CNN with separable 1D convolutions
class SepCNN(nn.Module) :
def __init__(self) :
super().__init__()
self.conv = nn.Sequential(
SepConv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),
nn.ReLU(),
nn.Dropout(dropout),
nn.AdaptiveMaxPool1d(pool_size)
)
self.fc = nn.Sequential(
nn.Linear(conv3_size * pool_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
x = x.transpose(1, 2)
x = self.conv(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
```
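%% Cell type:markdown id: tags:
A rough sketch comparing parameter counts of the plain and separable CNNs (the `count_parameters` helper is introduced here just for illustration), to back up the claim that separable convolutions shrink the model:
%% Cell type:code id: tags:
``` python
# Count trainable parameters of a model
def count_parameters(model) :
    return sum(p.numel() for p in model.parameters())
print(f'CNN   : {count_parameters(CNN())} parameters')
print(f'SepCNN: {count_parameters(SepCNN())} parameters')
```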
%% Cell type:markdown id: tags:
Additional regularization in the form of batch normalization. This model and the previous one proved to be the best-performing on single training regions.
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
conv1_size = 64
conv1_kernel = 7
conv2_size = 64
conv2_kernel = 7
conv3_size = 64
conv3_kernel = 7
conv_stride = 2
pool_size = 16
hidden_size = 32
output_size = 2
dropout = .4
# CNN with separable 1D convolutions
class BNSepCNN(nn.Module) :
def __init__(self) :
super().__init__()
self.conv = nn.Sequential(
SepConv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),
nn.BatchNorm1d(conv1_size),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),
nn.BatchNorm1d(conv2_size),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),
nn.BatchNorm1d(conv3_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.AdaptiveMaxPool1d(pool_size)
)
self.fc = nn.Sequential(
nn.Linear(conv3_size * pool_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
x = x.transpose(1, 2)
x = self.conv(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
```
%% Cell type:markdown id: tags:
## Training parameters
%% Cell type:code id: tags:
``` python
# Learning rate, starting epoch and max epochs
lr = 1e-3
epoch = 0
epochs = 500
# Init model and optimizer
model = BNSepCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = lr)
# LR scheduler
scheduler_warmup = 30
break_on_min_lr = True
min_lr = 1e-5
factor = .5
patience = 10
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = factor, patience = patience, verbose = True)
# Loss/accuracy logs
loss_stats = pd.DataFrame(columns = ['train', 'val'])
acc_stats = pd.DataFrame(columns = ['train', 'val'])
# CUDA
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(DEVICE)
# Model info
n_parameters = sum(p.numel() for p in model.parameters())
print(model)
print(f'The model \'{model.__class__.__name__}\' has {n_parameters} parameters')
```
%% Cell type:code id: tags:
``` python
# Create run directory
run_path = runs_path / str(datetime.datetime.now())
run_path.mkdir()
with open(run_path / 'model_architecture.txt', 'w') as f:
f.write(f"""{model}
{optimizer}
batch_size: {batch_size}
factor: {factor}
patience: {patience}
scheduler_warmup: {scheduler_warmup}""")
```
%% Cell type:code id: tags:
``` python
# Forward pass for a single batch
def forward_pass(model, samples, labels, criterion, optimizer = None) :
out = model(samples)
pred = out.argmax(dim = 1)
loss = criterion(out, labels)
if optimizer :
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss = loss.item() * labels.size(0)
correct = (pred == labels).sum().item()
return loss, correct
# Forward pass for whole dataset
def train_loop(model, loader, criterion, optimizer = None) :
model.train(optimizer is not None)
running_loss = 0
running_correct = 0
for samples, labels in loader :
samples, labels = samples.to(DEVICE), labels.to(DEVICE)
loss, correct = forward_pass(model, samples, labels, criterion, optimizer)
running_loss += loss
running_correct += correct
return running_loss, running_correct
```
%% Cell type:markdown id: tags:
## Train model
Train the model for the specified number of epochs, or until the learning rate, reduced by the scheduler, drops below the minimum threshold.
%% Cell type:code id: tags:
``` python
# Train model
for epoch in range(epoch, epoch + epochs) :
print(f'=== Epoch {(epoch + 1):3} ===')
# Train loop
loss, correct = train_loop(model, trainloader, criterion, optimizer)
train_loss = loss / len(trainset)
train_acc = correct / len(trainset)
print(f' training loss = {train_loss:.4f}, acc = {train_acc:.4f}')
# Validation loop
loss, correct = train_loop(model, valloader, criterion)
val_loss = loss / len(valset)
val_acc = correct / len(valset)
print(f'validation loss = {val_loss:.4f}, acc = {val_acc:.4f}')
# Statistics
loss_stats = pd.concat([loss_stats, pd.DataFrame([{
'train': train_loss,
'val': val_loss
}])], ignore_index = True)
acc_stats = pd.concat([acc_stats, pd.DataFrame([{
'train': train_acc,
'val': val_acc
}])], ignore_index = True)
# Save best model
if loss_stats['val'].idxmin() == len(loss_stats) - 1 :
torch.save(model, run_path / 'model_best.pt')
# Schedule learning rate after warmup period
if epoch >= scheduler_warmup :
scheduler.step(val_loss)
current_lr = optimizer.param_groups[0]['lr']
if break_on_min_lr and current_lr < min_lr :
break
torch.save(model, run_path / 'model_last.pt')
epoch += 1
```
%% Cell type:code id: tags:
``` python
# Plot train/val loss/accuracy
fig, ax = plt.subplots(1, 2, figsize = (16, 6))
g = sns.lineplot(data = loss_stats, ax = ax[0])
g = sns.lineplot(data = acc_stats, ax = ax[1])
ax[0].set_title('CE loss')
ax[1].set_title('accuracy')
ax[0].set_xlabel('epoch')
ax[1].set_xlabel('epoch')
fig.savefig(run_path / 'loss_acc_plot.png')
```
%% Cell type:markdown id: tags:
## Evaluation
%% Cell type:code id: tags:
``` python
# Evaluate trained model on test set
running_loss, running_correct = train_loop(model, testloader, criterion)
n = len(testset)
print(f'Evaluation after {epoch} epochs: loss = {(running_loss / n):.4f}, acc = {(running_correct / n):.4f}')
```
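%% Cell type:markdown id: tags:
Optional sketch (assuming the run saved a `model_best.pt` checkpoint): evaluate the lowest-validation-loss model instead of the last-epoch model.
%% Cell type:code id: tags:
``` python
# Load the best checkpoint; newer PyTorch versions may require weights_only = False
best_model = torch.load(run_path / 'model_best.pt', map_location = DEVICE)
loss, correct = train_loop(best_model, testloader, criterion)
n = len(testset)
print(f'Best checkpoint: loss = {(loss / n):.4f}, acc = {(correct / n):.4f}')
```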
......
%% Cell type:code id: tags:
``` python
from pathlib import Path
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_sequence, pad_sequence
from torchvision import transforms
np.random.seed(42)
pd.set_option("display.max_columns", None)
sns.set_theme()
```
%% Cell type:markdown id: tags:
## Dataset and paths
%% Cell type:code id: tags:
``` python
# Data paths
data_path = Path('data')
train_path = data_path / 'train'
test_path = data_path / 'test'
runs_path = Path('runs')
# Load labels
train_labels = pd.read_csv(f'{train_path}_label.csv')
test_labels = pd.read_csv(f'{test_path}_label.csv')
# Merge train and test labels
all_labels = pd.concat([train_labels, test_labels], ignore_index = True)
all_labels = all_labels.dropna()
all_labels.ret = all_labels.ret.astype(int)
```
%% Cell type:code id: tags:
``` python
# Minimal time series length after cleaning
min_series_length = 60
# Custom dataset per region
class RegionDataset(Dataset) :
def __init__(self, region_path, labels, transform = None) :
super().__init__()
self.region = region_path.name
self.transform = transform
self.dfs = []
for csv_path in region_path.iterdir() :
df = pd.read_csv(csv_path)
df = df.dropna()
df = df[(df.T != 0).any()]  # drop rows that are all zeros
label = labels[labels.file_name == csv_path.name].ret.values
if len(df) >= min_series_length and len(label) == 1 :
self.dfs.append((df, label[0]))
def __len__(self) :
return len(self.dfs)
def __getitem__(self, idx) :
df, label = self.dfs[idx]
if self.transform :
df = self.transform(df)
return df, label
```
%% Cell type:markdown id: tags:
## Region model
CNN model trained on time series from a single region.
It employs depth-wise separable convolutions to reduce the number of parameters, and dropout plus batch normalization for regularization.
%% Cell type:code id: tags:
``` python
# NN hyperparameters
input_size = 75
conv1_size = 64
conv1_kernel = 7
conv2_size = 64
conv2_kernel = 7
conv3_size = 64
conv3_kernel = 7
conv_stride = 2
pool_size = 16
hidden_size = 32
output_size = 2
dropout = .4
# Separable convolution layer
class SepConv1d(nn.Module) :
def __init__(self, in_channels, out_channels, kernel_size, stride = 1, padding = 0) :
super().__init__()
self.depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups = in_channels)
self.pointwise_conv = nn.Conv1d(in_channels, out_channels, kernel_size = 1)
def forward(self, x) :
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
# CNN with separable 1D convolutions
class BNSepCNN(nn.Module) :
def __init__(self) :
super().__init__()
self.conv = nn.Sequential(
SepConv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),
nn.BatchNorm1d(conv1_size),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),
nn.BatchNorm1d(conv2_size),
nn.ReLU(),
nn.Dropout(dropout),
SepConv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),
nn.BatchNorm1d(conv3_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.AdaptiveMaxPool1d(pool_size)
)
self.fc = nn.Sequential(
nn.Linear(conv3_size * pool_size, hidden_size),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_size, output_size)
)
def forward(self, x) :
x = x.transpose(1, 2)
x = self.conv(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
```
%% Cell type:markdown id: tags:
## Train models
%% Cell type:code id: tags:
``` python
# Create run directory
run_path = runs_path / str(datetime.datetime.now())
run_path.mkdir()
# CUDA
use_cuda = False
DEVICE = torch.device('cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu')
```
%% Cell type:code id: tags:
``` python
# Forward pass for a single batch
def forward_pass(model, samples, labels, criterion, optimizer = None) :
out = model(samples)
pred = out.argmax(dim = 1)
loss = criterion(out, labels)
if optimizer :
optimizer.zero_grad()
loss.backward()
optimizer.step()
loss = loss.item() * labels.size(0)
correct = (pred == labels).sum().item()
return loss, correct
# Forward pass for whole dataset
def train_loop(model, loader, criterion, optimizer = None) :
model.train(optimizer is not None)
running_loss = 0
running_correct = 0
for samples, labels in loader :
samples, labels = samples.to(DEVICE), labels.to(DEVICE)
loss, correct = forward_pass(model, samples, labels, criterion, optimizer)
running_loss += loss
running_correct += correct
return running_loss, running_correct
```
%% Cell type:markdown id: tags:
Define pre-processing operations and training hyperparameters.
%% Cell type:code id: tags:
``` python
# Preprocessing transformations
# TODO: PCA, truncated SVD, MinMaxScaling, scale over whole dataset, features selection
transform = transforms.Compose([
StandardScaler().fit_transform,
torch.FloatTensor
])
# Crop sequences to same length
def collate_crop(batch) :
samples, labels = zip(*batch)
length = min(x.size(0) for x in samples)
samples = torch.stack([x[-length:] for x in samples])
labels = torch.tensor(labels)
return samples, labels
collate_fn = collate_crop
# Dataset params
holdout = .1
batch_size = 8
# Learning rate and max epochs
lr = 1e-3
epochs = 500
# LR scheduler
scheduler_warmup = 30
break_on_min_lr = True
min_lr = 1e-5
factor = .5
patience = 10
```
%% Cell type:markdown id: tags:
Routine for loading a region's dataset and training a model until convergence.
%% Cell type:code id: tags:
``` python
# Load region dataset, train and save model
def train_routine(region_path) :
print(f'=== Region {region_path.name} ===')
print('Loading dataset...')
trainset = RegionDataset(region_path, all_labels, transform = transform)
# Split into train and validation set
n_val = int(len(trainset) * holdout)
n_train = len(trainset) - n_val
trainset, valset = random_split(trainset, [n_train, n_val])
trainloader = DataLoader(trainset, batch_size = batch_size, collate_fn = collate_fn, shuffle = True)
valloader = DataLoader(valset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)
# Init model and optimizer
model = BNSepCNN()
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = factor, patience = patience, verbose = False)
# Loss/accuracy logs
loss_stats = pd.DataFrame(columns = ['train', 'val'])
acc_stats = pd.DataFrame(columns = ['train', 'val'])
# Create model run directory
region_run_path = run_path / region_path.name
region_run_path.mkdir()
with open(region_run_path / 'model_architecture.txt', 'w') as f:
f.write(f"""{model}
{optimizer}
batch_size: {batch_size}
factor: {factor}
patience: {patience}
scheduler_warmup: {scheduler_warmup}""")
# Train model
print('Training model...')
for epoch in range(epochs) :
# Train loop
loss, correct = train_loop(model, trainloader, criterion, optimizer)
train_loss = loss / len(trainset)
train_acc = correct / len(trainset)
# Validation loop
loss, correct = train_loop(model, valloader, criterion)
val_loss = loss / len(valset)
val_acc = correct / len(valset)
# Statistics
loss_stats = pd.concat([loss_stats, pd.DataFrame([{
'train': train_loss,
'val': val_loss
}])], ignore_index = True)
acc_stats = pd.concat([acc_stats, pd.DataFrame([{
'train': train_acc,
'val': val_acc
}])], ignore_index = True)
# Save best model
if loss_stats['val'].idxmin() == len(loss_stats) - 1 :
torch.save(model, region_run_path / 'model_best.pt')
# Schedule learning rate after warmup period
if epoch >= scheduler_warmup :
scheduler.step(val_loss)
current_lr = optimizer.param_groups[0]['lr']
if break_on_min_lr and current_lr < min_lr :
break
torch.save(model, region_run_path / 'model_last.pt')
# Loss/accuracy plot
fig, ax = plt.subplots(1, 2, figsize = (16, 6))
g = sns.lineplot(data = loss_stats, ax = ax[0])
g = sns.lineplot(data = acc_stats, ax = ax[1])
ax[0].set_title('CE loss')