Commit 4f821b92 authored by PauTheu's avatar PauTheu
Browse files
parents 76a8859d 37089085
This source diff could not be displayed because it is too large. You can view the blob instead.
%% Cell type:code id:6b0d97d2 tags:
``` python
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import trange
```
%% Cell type:markdown id:02c7a9b0 tags:
# Windturbinenausfall-Vorhersage
%% Cell type:code id:7911fb2a tags:
``` python
def prep_data(data, categorical_columns=('region',)):
    """Split a labelled feature table into model-ready tensors.

    Args:
        data: DataFrame holding the feature columns plus a ``label`` column.
            The frame is not modified.
        categorical_columns: Names of feature columns that are encoded as
            integer category codes instead of being passed through as
            numbers. Defaults to ``('region',)``.

    Returns:
        A four-element list
        ``[numerical, categorical, categorical_embedding_sizes, labels]``:

        * ``numerical``: float tensor with one column per non-categorical
          feature, in the column order of ``data``.
        * ``categorical``: int64 tensor with one column of category codes per
          categorical feature; it has zero columns (but one row per sample)
          when no categorical column is present.
        * ``categorical_embedding_sizes``: one
          ``(number_of_categories, embedding_dim)`` tuple per categorical
          column, where ``embedding_dim`` is half the number of categories
          (rounded up), capped at 50.
        * ``labels``: flat int64 tensor built from the ``label`` column.
    """
    # split data from labels
    X, y = data.drop(columns=['label']), data['label']

    code_columns = []
    categorical_embedding_sizes = []
    numerical_names = []
    for column in X.columns:
        if column not in categorical_columns:
            numerical_names.append(column)
            continue
        # Fill missing values on a fresh Series rather than in place on a
        # column selection: an in-place fill on `X[column]` is not guaranteed
        # to reach `X`, and an unfilled NaN would get category code -1, which
        # is not a valid embedding index. Casting to object first lets the
        # string placeholder live next to numeric region ids.
        filled = X[column].astype(object).fillna('not specified')
        # transform values into categories
        c = filled.astype('category')
        # numerical code of each row's category
        code_columns.append(torch.tensor(c.cat.codes.to_numpy(), dtype=torch.int64))
        # embedding width for this categorical input, max 50
        n_categories = len(c.cat.categories)
        categorical_embedding_sizes.append((n_categories,
                                            min(50, (n_categories + 1) // 2)))

    if code_columns:
        categorical = torch.stack(code_columns, dim=1)
    else:
        # keep one (empty) row per sample so the tensor can still be indexed
        categorical = torch.empty((len(X), 0), dtype=torch.int64)

    return [torch.tensor(X[numerical_names].to_numpy(), dtype=torch.float),
            categorical,
            categorical_embedding_sizes,
            torch.tensor(y.values, dtype=torch.int64).flatten()]
```
%% Cell type:code id:5c15a7f7 tags:
``` python
class ClassificationDataSet(Dataset):
    """Torch dataset over a labelled feature DataFrame.

    The frame is converted with ``prep_data``; the numerical features are
    min-max scaled and kept as a float tensor next to the categorical codes,
    the embedding sizes and the labels.

    Attributes set by ``__init__``: ``numerical_data``, ``categorical_data``,
    ``embeddings``, ``labels`` and ``scaler``.
    """

    def __init__(self, dataframe, scaler=None):
        """Build the tensors for ``dataframe``.

        Args:
            dataframe: DataFrame with the feature columns and a ``label``
                column, as expected by ``prep_data``.
            scaler: Optional, already fitted scaler exposing ``transform``.
                When given, it is applied as-is so that validation/test data
                can share the scaling learned on the training data. When
                omitted, a new ``MinMaxScaler`` is fitted on this dataset.
        """
        numerical_data, self.categorical_data, self.embeddings, self.labels = prep_data(data=dataframe)
        numerical_array = numerical_data.numpy()
        if scaler is None:
            self.scaler = MinMaxScaler()
            scaled = self.scaler.fit_transform(numerical_array)
        else:
            self.scaler = scaler
            scaled = self.scaler.transform(numerical_array)
        self.numerical_data = torch.tensor(scaled, dtype=torch.float)

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict of tensors."""
        return {'numerical': self.numerical_data[idx],
                'categorical': self.categorical_data[idx],
                'label': self.labels[idx]}
```
%% Cell type:code id:859718c5 tags:
``` python
class Classifier(nn.Module):
    """Feed-forward classifier over numerical and embedded categorical inputs.

    Numerical features are batch-normalised, each categorical column is mapped
    through its own embedding, and the concatenation is passed through a stack
    of ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks followed by a final
    ``Linear`` layer and a ``Softmax`` over the classes.
    """

    def __init__(self, num_numerical_columns, output_size, layers_size, embeddings_size, p=0.4):
        """Create the layers.

        Args:
            num_numerical_columns: Number of numerical input features.
            output_size: Number of output classes.
            layers_size: Widths of the hidden layers, in order. May be empty,
                in which case the inputs feed the output layer directly.
            embeddings_size: One ``(num_categories, embedding_dim)`` tuple per
                categorical column. May be empty.
            p: Dropout probability used for the embeddings and hidden blocks.
        """
        super().__init__()
        # one BatchNorm1d over all numerical columns
        self.batch_norm_num = nn.BatchNorm1d(num_numerical_columns)
        # one embedding table per categorical column
        self.embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embeddings_size])
        # dropout for embeddings to avoid overfitting
        self.embedding_dropout = nn.Dropout(p)
        num_categorical_columns = sum(nf for _, nf in embeddings_size)

        all_layers = []
        input_size = num_numerical_columns + num_categorical_columns
        for width in layers_size:
            all_layers.append(nn.Linear(input_size, width))
            # activation function
            all_layers.append(nn.ReLU(inplace=True))
            # batch normalization of the hidden activations
            all_layers.append(nn.BatchNorm1d(width))
            all_layers.append(nn.Dropout(p))
            input_size = width
        # `input_size` is the width of the last hidden layer, or the raw input
        # width when there are no hidden layers (indexing layers_size[-1]
        # would fail in that case).
        all_layers.append(nn.Linear(input_size, output_size))
        all_layers.append(nn.Softmax(dim=1))
        self.layers = nn.Sequential(*all_layers)

    def forward(self, x_num, x_cat):
        """Return class probabilities for a batch.

        Args:
            x_num: Tensor of numerical features, indexed ``[sample, feature]``.
            x_cat: Tensor of category codes, indexed ``[sample, column]``, or
                ``None`` to use the numerical features only.

        Returns:
            Tensor of per-class probabilities (softmax over dimension 1).
        """
        x = self.batch_norm_num(x_num.float())
        # Skip the embedding path when there is nothing to embed; torch.cat
        # cannot concatenate an empty list.
        if x_cat is not None and len(self.embeddings) > 0:
            embs = [emb(x_cat[:, i].long()) for i, emb in enumerate(self.embeddings)]
            x_cat = self.embedding_dropout(torch.cat(embs, 1))
            x = torch.cat([x_cat, x], 1)
        return self.layers(x)
```
%% Cell type:code id:337ffeb9 tags:
``` python
def train(model, train_dataloader, validation_data, epochs, lr):
    """Train ``model`` and track loss/accuracy on the validation data.

    Args:
        model: Module called as ``model(numerical, categorical)``. It is
            expected to return per-class probabilities (as a final Softmax
            layer does), which are converted to log-probabilities here.
        train_dataloader: Iterable of batches, each a dict with the keys
            ``'numerical'``, ``'categorical'`` and ``'label'``.
        validation_data: Object exposing ``numerical_data``,
            ``categorical_data`` and ``labels`` for the whole validation set.
        epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        Three lists with one entry per epoch: mean training loss, validation
        loss and validation accuracy.
    """
    aggregated_train_losses = []
    aggregated_val_losses = []
    aggregated_val_accs = []
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.0003)
    loss_function = nn.NLLLoss()
    # NLLLoss expects log-probabilities; the clamp keeps log() finite when a
    # probability underflows to zero.
    min_probability = 1e-12

    for _ in trange(epochs):
        # Use the model that was passed in, not a module-level global.
        model.train()
        train_loss = 0
        for samples in train_dataloader:
            output = model(samples['numerical'], samples['categorical'])
            single_loss = loss_function(torch.log(output.clamp_min(min_probability)),
                                        samples['label'])
            train_loss += single_loss.item()
            optimizer.zero_grad()
            single_loss.backward()
            optimizer.step()
        train_loss = train_loss / len(train_dataloader)
        aggregated_train_losses.append(train_loss)

        model.eval()
        with torch.no_grad():
            output = model(validation_data.numerical_data, validation_data.categorical_data)
            val_loss = loss_function(torch.log(output.clamp_min(min_probability)),
                                     validation_data.labels).item()
            aggregated_val_losses.append(val_loss)
            output = torch.argmax(output, dim=1)
            val_acc = accuracy_score(validation_data.labels, output)
            aggregated_val_accs.append(val_acc)
    return aggregated_train_losses, aggregated_val_losses, aggregated_val_accs
```
%% Cell type:code id:f2bdcc50 tags:
``` python
# Load the selected-feature training table.
train_df = pd.read_pickle('data/train_selected_features.pkl')
# NOTE(review): this externally provided test table is replaced by the
# hold-out split below and is therefore currently unused. The former loop
# `for col in test_df.columns():` was removed: `columns` is an attribute, not
# a method, so the call raised a TypeError, and the loop had no effect on the
# data. Its comment suggests the intent was to drop columns that are NaN in
# the test data; re-add that explicitly if it is still wanted.
test_df = pd.read_pickle('data/test_selected_features.pkl')

# Alternative experiments that were toggled here earlier: using the
# "ten best features" pickles, holding out region 18 as the test set, training
# on region 60 only, and dropping the `region` column altogether.

# 80/20 hold-out test split, then 80/20 train/validation split of the rest.
train_df, test_df = train_test_split(train_df, test_size=0.2, random_state=1)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=1)

train_dataset = ClassificationDataSet(dataframe=train_df)
val_dataset = ClassificationDataSet(dataframe=val_df)
test_dataset = ClassificationDataSet(dataframe=test_df)

train_loader = DataLoader(train_dataset, batch_size=30, shuffle=True, drop_last=True)
```
%% Cell type:code id:1cbb6022 tags:
``` python
# Show the (rows, columns) shape of the training split.
train_df.shape
```
%%%% Output: execute_result
(21288, 41)
%% Cell type:code id:927c9ce4 tags:
``` python
# Hidden-layer widths derived from the feature count. They belong to an
# alternative layout ([l1, l2, l3, l4]) that is currently not passed to the
# model.
l1 = train_df.shape[1] - 1
l2 = l1 ** 2
l3 = l2 // 2
l4 = l3 // 2

# Training hyper-parameters.
training_epochs, lr = 100, 0.0004

# Two-class model: the first hidden layer is as wide as the number of columns
# minus one, followed by fixed widths 40, 10 and 5.
oracle = Classifier(
    num_numerical_columns=train_dataset.numerical_data.shape[1],
    embeddings_size=train_dataset.embeddings,
    layers_size=[train_df.shape[1] - 1, 40, 10, 5],
    output_size=2,
    p=0.4,
)

train_losses, val_losses, val_accs = train(oracle, train_loader, val_dataset, training_epochs, lr)
```
%% Cell type:code id:db3ef579 tags:
``` python
# Show the label tensor of the training dataset.
train_dataset.labels
```
%%%% Output: execute_result
tensor([0, 1, 0, ..., 1, 0, 1])
%% Cell type:code id:2e09221f tags:
``` python
# Print the name of every parameter registered on the model, one per line.
for named_entry in oracle.named_parameters():
    print(named_entry[0])
```
%% Cell type:code id:2f34c976 tags:
``` python
# Plot the per-epoch training and validation loss curves on the same axes.
sns.lineplot(data= train_losses, label='training loss')
sns.lineplot(data= val_losses, label='validation loss')
# Optional third curve (validation accuracy per epoch), currently disabled:
#sns.lineplot(data= val_accs, label='validation accuracy')
```
%%%% Output: execute_result
<AxesSubplot:>
%%%% Output: display_data
![]()
%% Cell type:code id:1cb5e2b3 tags:
``` python
# Evaluate the trained model on the held-out test dataset.
loss_func = nn.NLLLoss()
oracle.eval()
with torch.no_grad():
    # Model output for the test set (the name `y_val` is kept because later
    # cells refer to it).
    y_val = oracle(test_dataset.numerical_data,
                   test_dataset.categorical_data)
    # The model ends in a Softmax and therefore returns probabilities, while
    # NLLLoss expects log-probabilities. The clamp keeps log() finite if a
    # probability underflows to zero.
    loss = loss_func(torch.log(y_val.clamp_min(1e-12)), test_dataset.labels)
    # Predicted class = index of the largest probability.
    out = torch.argmax(y_val, dim=1)
```
%% Cell type:code id:58fa9cbf tags:
``` python
# Column-wise sum of the model outputs over all test samples. The value is
# computed but not shown, because a cell only echoes its last expression.
y_val.sum(dim=0)
# Mean of the test labels (the share of label 1, assuming labels are 0/1).
test_dataset.labels.sum()/len(test_dataset.labels)
```
%%%% Output: execute_result
tensor(0.4953)
%% Cell type:code id:18b5e4f3 tags:
``` python
# Report the test loss followed by accuracy, recall and precision of the
# predicted classes against the true test labels.
print("Loss:", loss.item())

acc = accuracy_score(y_true=test_dataset.labels, y_pred=out)
print("Accuracy:", acc)

recall = recall_score(y_true=test_dataset.labels, y_pred=out)
print("Recall:", recall)

precision = precision_score(y_true=test_dataset.labels, y_pred=out)
print("Precision:", precision)
```
%% Cell type:code id:cde855e6 tags:
``` python
# Confusion matrix of raw counts (no normalisation) for the test predictions,
# drawn as an annotated heatmap with integer cell labels.
cf_matrix = confusion_matrix(y_true=test_dataset.labels, y_pred=out)
sns.heatmap(data=cf_matrix, annot=True, fmt='d')
```
%%%% Output: execute_result
<AxesSubplot:>
%%%% Output: display_data
![]()
%% Cell type:code id:7a0d2c30 tags:
``` python
# Display the training split.
train_df
```
%%%% Output: execute_result
Inverter INU temperature__maximum Inverter INU temperature__median \
18624 38.0 37.0
4983 66.0 64.0
7037 31.0 29.0
25838 0.0 0.0
9197 47.0 46.0
... ... ...
8247 30.0 29.0
20052 36.0 34.0
25093 41.0 37.0
18611 40.0 37.0
21907 0.0 0.0
blade 3 angle__maximum blade 3 pitch motor temperature__abs_energy \
18624 0.27 475893.0
4983 7.35 1960200.0
7037 0.49 4437335.0
25838 0.00 0.0
9197 0.59 1545564.0
... ... ...
8247 0.52 1398262.0
20052 10.56 625922.0
25093 0.24 554496.0
18611 0.30 665970.0
21907 90.86 74725.0
inverter inlet pressure__maximum \
18624 0.3
4983 0.7
7037 0.4
25838 0.0
9197 0.8
... ...
8247 0.1
20052 0.1
25093 0.2
18611 0.3
21907 0.0
inverter inlet pressure__standard_deviation \
18624 0.047979
4983 0.037661
7037 0.055476
25838 0.000000
9197 0.055319
... ...
8247 0.048412
20052 0.022701
25093 0.009502
18611 0.046817
21907 0.000000
inverter outlet pressure__abs_energy \
18624 5040.63
4983 3590.13
7037 2078.13
25838 0.00
9197 3480.96
... ...
8247 2108.54
20052 3594.13
25093 3105.68
18611 5231.48
21907 648.00
inverter outlet pressure__absolute_sum_of_changes \
18624 8.8
4983 15.6
7037 29.4
25838 0.0
9197 11.2
... ...
8247 13.2
20052 45.0
25093 228.8
18611 15.0
21907 0.0
inverter outlet pressure__maximum inverter outlet pressure__median \
18624 3.5 3.4
4983 2.9 2.8
7037 2.3 2.2
25838 0.0 0.0
9197 2.8 2.8
... ... ...
8247 2.3 2.2
20052 3.0 2.9
25093 3.7 2.5
18611 3.5 3.4
21907 1.2 1.2
... main bearing temperature 1__maximum \
18624 ... 46.6
4983 ... 62.6
7037 ... 34.3
25838 ... -8.5
9197 ... 61.6
... ... ...
8247 ... 47.3
20052 ... 57.5
25093 ... 37.4
18611 ... 64.3
21907 ... -4.5
main bearing temperature 1__median \
18624 46.6
4983 62.5
7037 33.4
25838 -8.6
9197 61.5
... ...
8247 47.2
20052 57.2
25093 36.9
18611 64.1
21907 -4.5
main bearing temperature 2__abs_energy \
18624 1066966.31
4983 1706222.52
7037 495015.15
25838 29600.70
9197 1653138.75
... ...
8247 941833.67
20052 1345442.36
25093 633424.00
18611 2024975.25
21907 9480.14
main bearing temperature 2__maximum \
18624 49.7
4983 61.6
7037 34.5
25838 -8.1
9197 61.1
... ...
8247 46.9
20052 57.0
25093 38.6
18611 67.3
21907 -4.5
main bearing temperature 2__median \
18624 49.4
4983 61.6
7037 33.7
25838 -8.1
9197 61.0
... ...
8247 46.7
20052 56.7
25093 37.9
18611 67.1
21907 -4.6
reactive power control status__abs_energy region \
18624 437.0 49
4983 450.0 15
7037 437.0 18
25838 449.0 60
9197 444.0 20
... ... ...
8247 431.0 20
20052 3762.0 52
25093 3969.0 57
18611 450.0 49
21907 450.0 55
wind tower ambient temperature__abs_energy \
18624 120973.96
4983 219862.51
7037 158461.85
25838 107872.25
9197 170008.61
... ...
8247 157455.60
20052 101724.48
25093 57379.94
18611 163326.89
21907 41153.03
wind tower ambient temperature__maximum \
18624 16.8
4983 22.3
7037 19.1
25838 -15.5
9197 19.7
... ...
8247 19.3
20052 15.6
25093 11.5
18611 19.3
21907 -9.5
wind tower ambient temperature__median
18624 16.6
4983 22.1
7037 19.0
25838 -15.5
9197 19.6
... ...
8247 19.1
20052 15.6
25093 11.4
18611 19.0
21907 -9.6
[21288 rows x 22 columns]
%% Cell type:markdown id:3342901f tags:
# Regionsvorhersage
%% Cell type:code id:d01750f1 tags:
``` python
# find region similarity
region_data = pd.read_pickle('data/train_selected_features.pkl')
X = region_data.drop(columns=['region'])
y = region_data.region
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
dtree_model = DecisionTreeClassifier(max_depth = 10).fit(X_train, y_train)
dtree_predictions = dtree_model.predict(X_test)
accuracy_score(y_test, dtree_predictions)
```
%%%% Output: execute_result
1.0
%% Cell type:code id:881a4604-4f0c-48ad-a438-6a38c528251c tags:
``` python
# Draw a small sample per region and label, then split it for region
# prediction.
region_data = pd.read_pickle('data/train_selected_features.pkl')
n_samples = 12

# Collect the per-region samples and concatenate once; DataFrame.append was
# removed in pandas 2.0.
sampled_parts = []
for reg in region_data.region.unique():
    print(reg)
    reg_data_0 = region_data.loc[(region_data.region == reg) & (region_data.label == 0)]
    print(len(reg_data_0))
    reg_data_1 = region_data.loc[(region_data.region == reg) & (region_data.label == 1)]
    print(len(reg_data_1))
    # Cap the sample size at the group size so that small groups do not make
    # sample() raise.
    sampled_parts.append(reg_data_0.sample(n=min(n_samples, len(reg_data_0))))
    sampled_parts.append(reg_data_1.sample(n=min(n_samples, len(reg_data_1))))
region_data_sample = pd.concat(sampled_parts)

# NOTE(review): `reg_data_0` here is the value left over from the last loop
# iteration, so all label-0 rows of the last region are added on top of the
# balanced sample. This is kept as in the original; confirm it is intended.
sample_with_last_region = pd.concat([region_data_sample, reg_data_0])
X = sample_with_last_region.drop(columns=['region'])
y = sample_with_last_region.region
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
```
%% Cell type:code id:9e8240dd-4119-4e9d-9f6a-13b6f30b36eb tags:
``` python
# Display the per-region sample.
region_data_sample
```
%%%% Output: execute_result
Aircraft weather station wind speed__abs_energy \
7459 14957.07
7715 2835.32
7372 22540.88