Commit 37089085 authored by anastasiaslobodyanik

Merge branch 'master' of git.scc.kit.edu:oliver.wirth/windturbineprediction

parents 4ab1a670 20201f95
.ipynb_checkpoints
.train
.test
\ No newline at end of file
......@@ -63,6 +63,9 @@
"metadata": {},
"outputs": [],
"source": [
"# Minimal time series length after cleaning\n",
"min_series_length = 60\n",
"\n",
"# Custom dataset per region\n",
"class RegionDataset(Dataset) :\n",
" \n",
......@@ -78,7 +81,7 @@
" df = df[(df.T != 0).any()]\n",
" \n",
" label = labels[labels.file_name == csv_path.name].ret.values\n",
" if df.shape[0] > 0 and len(label) == 1 :\n",
" if len(df) >= min_series_length and len(label) == 1 :\n",
" self.dfs.append((df, label[0]))\n",
" \n",
" def __len__(self) :\n",
......@@ -91,6 +94,13 @@
" return df, label"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Pre-processing transformations."
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -105,8 +115,20 @@
"])\n",
"\n",
"# Load train and test set\n",
"trainset = RegionDataset(train_path / '004', all_labels, transform = transform)\n",
"testset = RegionDataset(test_path / 'dummy', all_labels, transform = transform)"
"trainset = RegionDataset(train_path / '004', all_labels, transform = transform)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data split and batching hyperparameters.\n",
"Three different collate-functions are present:\n",
"- `collate_pack` packs timeseries in to a PyTorch `PackedSequence` object, which allows masking in recurrent networks\n",
"- `collate_crop` crops timeseries to the length of the shortest series in the batch, data points are removed from the front as later data points likely contribute more information toward the classification goal\n",
"- `collate_pad` pads sequences to the length of the longest sequence in the batch\n",
"\n",
"Experiments showed that `collate_crop` seems to work best for CNNs."
]
},
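The packing and padding variants are defined in a cell not shown in this hunk. A minimal sketch of what they could look like, assuming each batch element is a `(FloatTensor, label)` pair as produced by `RegionDataset` (the crop variant appears verbatim in the full notebook further below):

```python
import torch
from torch.nn.utils.rnn import pack_sequence, pad_sequence

# Pack variable-length series into a PackedSequence, which recurrent layers
# (nn.LSTM / nn.GRU) can consume while ignoring the padded time steps
def collate_pack(batch):
    samples, labels = zip(*batch)
    samples = pack_sequence(samples, enforce_sorted = False)
    labels = torch.tensor(labels)
    return samples, labels

# Zero-pad every series to the length of the longest one in the batch
def collate_pad(batch):
    samples, labels = zip(*batch)
    samples = pad_sequence(samples, batch_first = True)
    labels = torch.tensor(labels)
    return samples, labels
```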
{
......@@ -117,7 +139,7 @@
"source": [
"# Split into train and validation set\n",
"holdout = .2\n",
"n_val = int(len(trainset) * .2)\n",
"n_val = int(len(trainset) * holdout)\n",
"n_train = len(trainset) - n_val\n",
"trainset, valset = random_split(trainset, [n_train, n_val])\n",
"\n",
......@@ -148,7 +170,6 @@
"batch_size = 8\n",
"collate_fn = collate_crop\n",
"trainloader = DataLoader(trainset, batch_size = batch_size, collate_fn = collate_fn, shuffle = True)\n",
"testloader = DataLoader(testset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)\n",
"valloader = DataLoader(valset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)"
]
},
......@@ -156,7 +177,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Models"
"## Models\n",
"\n",
"Various RNN and CNN models, starting with a LSTM-based RNN."
]
},
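The body of the LSTM model is elided from this hunk. A hedged sketch of an LSTM-based classifier of this kind; the layer sizes reuse the hyperparameter values shown later in the notebook (`input_size = 75`, `hidden_size = 32`, `output_size = 2`) and are not taken from the elided cell:

```python
import torch
from torch import nn

# Sketch only: classify a (batch, time, features) series from the last hidden state
class LSTMClassifier(nn.Module):

    def __init__(self, input_size = 75, hidden_size = 32, output_size = 2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # x can be a padded tensor or a PackedSequence (e.g. from collate_pack)
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])
```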
{
......@@ -193,6 +216,13 @@
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Similar to above network, but with a GRU instead of LSTM."
]
},
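Presumably only the recurrent layer changes; a corresponding sketch with the same assumed sizes (`nn.GRU` returns the hidden state without a cell state):

```python
from torch import nn

class GRUClassifier(nn.Module):

    def __init__(self, input_size = 75, hidden_size = 32, output_size = 2):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, h_n = self.gru(x)          # no cell state, unlike nn.LSTM
        return self.fc(h_n[-1])
```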
{
"cell_type": "code",
"execution_count": null,
......@@ -227,6 +257,13 @@
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One-dimensional CNN with three convolutional layers."
]
},
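This cell's body is also not part of the hunk. A sketch of the presumed structure, mirroring the separable-convolution model shown in full further below (transpose to channels-first, three strided `Conv1d` blocks, adaptive pooling, fully connected head); the widths and kernel sizes are the ones used by that later model, not necessarily this cell's:

```python
import torch
from torch import nn

class PlainCNN(nn.Module):

    def __init__(self, input_size = 75, channels = 64, kernel = 7, stride = 2,
                 pool_size = 16, hidden_size = 32, output_size = 2):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(input_size, channels, kernel, stride), nn.ReLU(),
            nn.Conv1d(channels, channels, kernel, stride), nn.ReLU(),
            nn.Conv1d(channels, channels, kernel, stride), nn.ReLU(),
            nn.AdaptiveMaxPool1d(pool_size)
        )
        self.fc = nn.Sequential(
            nn.Linear(channels * pool_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        x = x.transpose(1, 2)          # (batch, time, features) -> (batch, channels, time)
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)
```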
{
"cell_type": "code",
"execution_count": null,
......@@ -279,6 +316,13 @@
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Another CNN with depth-wise separable convolutions, to reduce the amount of parameters, while maintaining classification performance."
]
},
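For a rough sense of the savings: with the first layer's hyperparameters from the region-model notebook below (75 input channels, 64 output channels, kernel size 7), a standard `nn.Conv1d` holds 75 · 64 · 7 = 33,600 weights, whereas the separable version holds 75 · 7 = 525 depthwise plus 75 · 64 = 4,800 pointwise weights, about 5,300 in total, roughly a six-fold reduction (bias terms omitted).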
{
"cell_type": "code",
"execution_count": null,
......@@ -344,6 +388,13 @@
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Additional regularization in the form of batch normalization. This and the previous model proved to be best-performing on single training regions."
]
},
{
"cell_type": "code",
"execution_count": null,
......@@ -505,7 +556,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model"
"## Train model\n",
"\n",
"Train model for the specified number of epochs, or when the learning rate falls below a certain point (by the scheduler)."
]
},
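The training loop itself is not part of this hunk. A minimal sketch of the stopping rule described above, assuming the same names (`scheduler_warmup`, `min_lr`, `break_on_min_lr`) that appear in the region-training notebook further below:

```python
# Step the ReduceLROnPlateau scheduler after a warmup period and signal an
# early stop once the learning rate drops below min_lr (sketch, names assumed)
def should_stop(epoch, val_loss, optimizer, scheduler,
                scheduler_warmup = 30, min_lr = 1e-5, break_on_min_lr = True):
    if epoch < scheduler_warmup:
        return False
    scheduler.step(val_loss)
    current_lr = optimizer.param_groups[0]['lr']
    return break_on_min_lr and current_lr < min_lr
```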
{
......@@ -561,6 +614,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Plot train/val loss/accuracy\n",
"fig, ax = plt.subplots(1, 2, figsize = (16, 6))\n",
"g = sns.lineplot(data = loss_stats, ax = ax[0])\n",
"g = sns.lineplot(data = acc_stats, ax = ax[1])\n",
......@@ -572,25 +626,6 @@
"\n",
"fig.savefig(run_path / 'loss_acc_plot.png')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate trained model on test set\n",
"running_loss, running_correct = train_loop(model, testloader, criterion)\n",
"n = len(testset)\n",
"print(f'Evaluation after {epoch} epochs: loss = {(running_loss / n):.4f}, acc = {(running_correct / n):.4f}')"
]
}
],
"metadata": {
......
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import datetime\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from tqdm import tqdm\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"import torch\n",
"from torch import nn, optim\n",
"from torch.utils.data import Dataset, DataLoader\n",
"from torch.utils.data.dataset import random_split\n",
"from torch.nn.utils.rnn import pack_sequence, pad_sequence\n",
"from torchvision import transforms\n",
"\n",
"np.random.seed(42)\n",
"pd.set_option(\"display.max_columns\", None)\n",
"sns.set_theme()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset and paths"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data paths\n",
"data_path = Path('data')\n",
"train_path = data_path / 'train'\n",
"test_path = data_path / 'test'\n",
"runs_path = Path('runs')\n",
"\n",
"# Load labels\n",
"train_labels = pd.read_csv(f'{train_path}_label.csv')\n",
"test_labels = pd.read_csv(f'{test_path}_label.csv')\n",
"\n",
"# Merge train and test labels\n",
"all_labels = train_labels.append(test_labels, ignore_index = True)\n",
"all_labels = all_labels.dropna()\n",
"all_labels.ret = all_labels.ret.astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal time series length after cleaning\n",
"min_series_length = 60\n",
"\n",
"# Custom dataset per region\n",
"class RegionDataset(Dataset) :\n",
" \n",
" def __init__(self, region_path, labels, transform = None) :\n",
" super().__init__()\n",
" self.region = region_path.name\n",
" self.transform = transform\n",
" self.dfs = []\n",
" \n",
" for csv_path in region_path.iterdir() :\n",
" df = pd.read_csv(csv_path)\n",
" df = df.dropna()\n",
" df = df[(df.T != 0).any()]\n",
" \n",
" label = labels[labels.file_name == csv_path.name].ret.values\n",
" if len(df) >= min_series_length and len(label) == 1 :\n",
" self.dfs.append((df, label[0]))\n",
" \n",
" def __len__(self) :\n",
" return len(self.dfs)\n",
" \n",
" def __getitem__(self, idx) :\n",
" df, label = self.dfs[idx]\n",
" if self.transform :\n",
" df = self.transform(df)\n",
" return df, label"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Region model\n",
"\n",
"CNN model trained on timeseries from a single region.\n",
"Employing depth-wise separable convolutions for reducing number of parameters and dropout, batch normalization for regularization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NN hyperparameters\n",
"input_size = 75\n",
"conv1_size = 64\n",
"conv1_kernel = 7\n",
"conv2_size = 64\n",
"conv2_kernel = 7\n",
"conv3_size = 64\n",
"conv3_kernel = 7\n",
"conv_stride = 2\n",
"pool_size = 16\n",
"hidden_size = 32\n",
"output_size = 2\n",
"dropout = .4\n",
"\n",
"# Separable convolution layer\n",
"class SepConv1d(nn.Module) :\n",
" \n",
" def __init__(self, in_channels, out_channels, kernel_size, stride = 1, padding = 0) :\n",
" super().__init__()\n",
" self.depthwise_conv = nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups = in_channels)\n",
" self.pointwise_conv = nn.Conv1d(in_channels, out_channels, kernel_size = 1)\n",
" \n",
" def forward(self, x) :\n",
" x = self.depthwise_conv(x)\n",
" x = self.pointwise_conv(x)\n",
" return x\n",
"\n",
"# CNN with separable 1D convolutions\n",
"class BNSepCNN(nn.Module) :\n",
" \n",
" def __init__(self) :\n",
" super().__init__()\n",
" self.conv = nn.Sequential(\n",
" SepConv1d(input_size, conv1_size, kernel_size = conv1_kernel, stride = conv_stride),\n",
" nn.BatchNorm1d(conv1_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" SepConv1d(conv1_size, conv2_size, kernel_size = conv2_kernel, stride = conv_stride),\n",
" nn.BatchNorm1d(conv2_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" SepConv1d(conv2_size, conv3_size, kernel_size = conv3_kernel, stride = conv_stride),\n",
" nn.BatchNorm1d(conv3_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" nn.AdaptiveMaxPool1d(pool_size)\n",
" )\n",
" self.fc = nn.Sequential(\n",
" nn.Linear(conv3_size * pool_size, hidden_size),\n",
" nn.ReLU(),\n",
" nn.Dropout(dropout),\n",
" nn.Linear(hidden_size, output_size)\n",
" )\n",
" \n",
" def forward(self, x) :\n",
" x = x.transpose(1, 2)\n",
" x = self.conv(x)\n",
" x = x.view(x.size(0), -1)\n",
" x = self.fc(x)\n",
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create run directory\n",
"run_path = runs_path / str(datetime.datetime.now())\n",
"run_path.mkdir()\n",
"\n",
"# CUDA\n",
"use_cuda = False\n",
"DEVICE = torch.device('cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Forward pass for a single batch\n",
"def forward_pass(model, samples, labels, criterion, optimizer = None) :\n",
" out = model(samples)\n",
" pred = out.argmax(dim = 1)\n",
" loss = criterion(out, labels)\n",
"\n",
" if optimizer :\n",
" optimizer.zero_grad()\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" loss = loss.item() * labels.size(0)\n",
" correct = (pred == labels).sum().item()\n",
" return loss, correct\n",
"\n",
"# Forward pass for whole dataset\n",
"def train_loop(model, loader, criterion, optimizer = None) :\n",
" model.train(optimizer is not None)\n",
" running_loss = 0\n",
" running_correct = 0\n",
" \n",
" for samples, labels in loader :\n",
" samples, labels = samples.to(DEVICE), labels.to(DEVICE)\n",
" loss, correct = forward_pass(model, samples, labels, criterion, optimizer)\n",
" running_loss += loss\n",
" running_correct += correct\n",
" \n",
" return running_loss, running_correct"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define pre-processing operations and training hyperparameters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Preprocessing transformations\n",
"# TODO: PCA, truncated SVD, MinMaxScaling, scale over whole dataset, features selection\n",
"transform = transforms.Compose([\n",
" StandardScaler().fit_transform,\n",
" torch.FloatTensor\n",
"])\n",
"\n",
"# Crop sequences to same length\n",
"def collate_crop(batch) :\n",
" samples, labels = zip(*batch)\n",
" length = min(x.size(0) for x in samples)\n",
" samples = torch.stack([x[-length:] for x in samples])\n",
" labels = torch.tensor(labels)\n",
" return samples, labels\n",
"collate_fn = collate_crop\n",
"\n",
"# Dataset params\n",
"holdout = .1\n",
"batch_size = 8\n",
"\n",
"# Learning rate and max epochs\n",
"lr = 1e-3\n",
"epochs = 500\n",
"\n",
"# LR scheduler\n",
"scheduler_warmup = 30\n",
"break_on_min_lr = True\n",
"min_lr = 1e-5\n",
"factor = .5\n",
"patience = 10"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Routine for loading a region's dataset, and training a model until convergence."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load region dataset, train and save model\n",
"def train_routine(region_path) :\n",
" print(f'=== Region {region_path.name} ===')\n",
" print('Loading dataset...')\n",
" trainset = RegionDataset(region_path, all_labels, transform = transform)\n",
" \n",
" # Split into train and validation set\n",
" n_val = int(len(trainset) * holdout)\n",
" n_train = len(trainset) - n_val\n",
" trainset, valset = random_split(trainset, [n_train, n_val])\n",
" trainloader = DataLoader(trainset, batch_size = batch_size, collate_fn = collate_fn, shuffle = True)\n",
" valloader = DataLoader(valset , batch_size = batch_size, collate_fn = collate_fn, shuffle = False)\n",
" \n",
" # Init model and optimizer\n",
" model = BNSepCNN()\n",
" model = model.to(DEVICE)\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = optim.Adam(model.parameters(), lr = lr)\n",
" scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = factor, patience = patience, verbose = False)\n",
" \n",
" # Loss/accuracy logs\n",
" loss_stats = pd.DataFrame(columns = ['train', 'val'])\n",
" acc_stats = pd.DataFrame(columns = ['train', 'val'])\n",
" \n",
" # Create model run directory\n",
" region_run_path = run_path / region_path.name\n",
" region_run_path.mkdir()\n",
" with open(region_run_path / 'model_architecture.txt', 'w') as f:\n",
" f.write(f\"\"\"{model}\n",
"\n",
" {optimizer}\n",
" batch_size: {batch_size}\n",
" factor: {factor}\n",
" patience: {patience}\n",
" scheduler_warmup: {scheduler_warmup}\"\"\")\n",
" \n",
" # Train model\n",
" print('Training model...')\n",
" for epoch in range(epochs) :\n",
" # Train loop\n",
" loss, correct = train_loop(model, trainloader, criterion, optimizer)\n",
" train_loss = loss / len(trainset)\n",
" train_acc = correct / len(trainset)\n",
" \n",
" # Validation loop\n",
" loss, correct = train_loop(model, valloader, criterion)\n",
" val_loss = loss / len(valset)\n",
" val_acc = correct / len(valset)\n",
"\n",
" # Statistics\n",
" loss_stats = loss_stats.append({\n",
" 'train': train_loss,\n",
" 'val': val_loss\n",
" }, ignore_index = True)\n",
" acc_stats = acc_stats.append({\n",
" 'train': train_acc,\n",
" 'val': val_acc\n",
" }, ignore_index = True)\n",
"\n",
" # Save best model\n",
" if loss_stats['val'].idxmin() == len(loss_stats) - 1 :\n",
" torch.save(model, region_run_path / 'model_best.pt')\n",
"\n",
" # Schedule learning rate after warmup period\n",
" if epoch >= scheduler_warmup :\n",
" scheduler.step(val_loss)\n",
" current_lr = optimizer.param_groups[0]['lr']\n",
" if break_on_min_lr and current_lr < min_lr :\n",
" break\n",
" torch.save(model, region_run_path / 'model_last.pt')\n",
" \n",
" # Loss/accuracy plot\n",
" fig, ax = plt.subplots(1, 2, figsize = (16, 6))\n",
" g = sns.lineplot(data = loss_stats, ax = ax[0])\n",
" g = sns.lineplot(data = acc_stats, ax = ax[1])\n",
" ax[0].set_title('CE loss')\n",
" ax[1].set_title('accuracy')\n",
" ax[0].set_xlabel('epoch')\n",
" ax[1].set_xlabel('epoch')\n",
" fig.savefig(region_run_path / 'loss_acc_plot.png')\n",
" print(f'best validation acc = {acc_stats[\"val\"].max()}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Blacklist already trained regions to save time."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Exclude already trained models (from 2021-06-04 14:23:10.737430, 2021-06-05 11:33:17.875281)\n",
"# blacklist = ['004', '011', '037', '006', '017', '018', '020', '029', '049', '052', '055', '057', '060', '064']\n",
"blacklist = []\n",
"\n",
"# Train models for all regions\n",
"for region_path in train_path.iterdir() :\n",
" if region_path.is_dir() and region_path.name not in blacklist :\n",
" train_routine(region_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation\n",
"\n",
"Evaluate trained models on test regions, by combining the models into an ensemble."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ensemble model consisting of pre-trained region models\n",
"class RegionEnsemble(nn.Module) :\n",
" \n",
" def __init__(self, run_path) :\n",
" super().__init__()\n",
" self.models = nn.ModuleDict()\n",
" # Load all previously trained models\n",
" for region_run_path in run_path.iterdir() :\n",
" if region_run_path.is_dir() :\n",
" model = torch.load(region_run_path / 'model_best.pt')\n",
" self.models[region_run_path.name] = model\n",
" \n",
" def forward(self, x) :\n",
" # Get region model outputs\n",
" x = torch.stack([model(x) for model in self.models.values()], dim = 1)\n",
" # Convert to probabilities\n",
" x = x.softmax(dim = 2)\n",
" # Average probabilities (confidence-weighted majority vote)\n",
" x = x.mean(dim = 1)\n",
" return x"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Adjust `run_path` if necessary."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load ensemble\n",
"# run_path = runs_path / 'best'\n",
"ensemble = RegionEnsemble(run_path)\n",
"ensemble = ensemble.to(DEVICE)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluation routine analogous to the training routine."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Calculate binary classification statistics for a test region\n",
"def eval_routine(model, region_path) :\n",
" print(f'=== Region {region_path.name} ===')\n",
" print('Loading dataset...')\n",
" testset = RegionDataset(region_path, all_labels, transform = transform)\n",
" testloader = DataLoader(testset, batch_size = batch_size, collate_fn = collate_fn, shuffle = False)\n",
" model.train(False)\n",
" \n",
" # Pass through model\n",
" print('Evaluating model...')\n",
" tp = tn = fp = fn = 0\n",
" for samples, labels in testloader :\n",
" samples, labels = samples.to(DEVICE), labels.to(DEVICE)\n",
" out = model(samples)\n",
" pred = out.argmax(dim = 1)\n",