1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
# --- stdlib ---
import os
import pickle as pkl
import sys
from collections import defaultdict
from copy import deepcopy
from os.path import join as oj

# --- third-party ---
import numpy as np
import pandas as pd
import scipy
import torch
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from torch import nn, optim
from torch.nn import functional as F
from tqdm import tqdm

# --- project-local (need ../src on the import path first) ---
sys.path.append('../src')
import config
import data
import features
import models
import neural_networks
import outcomes
import train_reg
from config import *  # NOTE(review): star import kept for compatibility; may shadow names above
if __name__ == '__main__':
    # Evaluate LSTM and gradient-boosting checkpoints that were trained on
    # progressively downsampled data (downsample factors 1/2/5/10, 10 batches
    # each), recording accuracy / F1 / ROC-AUC on the held-out test split,
    # then compute the DASC baseline accuracy. Results are pickled to
    # ../reports/.
    outcome_def = 'successful_full'
    print("loading data")
    dsets = ['clath_aux_dynamin']
    splits = ['test']
    meta = ['cell_num', 'Y_sig_mean', 'Y_sig_mean_normalized', 'X_max_orig', outcome_def]
    dfs, feat_names = data.load_dfs_for_lstm(dsets=dsets,
                                             splits=splits,
                                             meta=meta,
                                             length=40,
                                             padding='end')
    df_test = pd.concat([dfs[(k, s)]
                         for (k, s) in dfs
                         if s == 'test'])[feat_names + meta]
    X1 = df_test[feat_names[:1]]   # first feature: fixed-length trace for the LSTM
    X2 = df_test[feat_names[1:]]   # remaining engineered features for the GB model
    X2 = X2.fillna(X2.mean())      # GB cannot handle NaNs; impute with column means
    y = df_test[outcome_def].values

    # Loop-invariant amplitude gate: a positive prediction only counts if the
    # track's original max intensity exceeds 1500 (same rule for both models).
    amp_ok = df_test['X_max_orig'].values > 1500

    accuracy = {}
    for k in [1, 2, 5, 10]:
        for j in tqdm(range(10)):
            # --- LSTM checkpoint ---
            checkpoint_fname = f'../models/models_different_size_10/downsample_{k}_batch_{j}_lstm.pkl'
            with open(checkpoint_fname, 'rb') as f:
                results = pkl.load(f)
            dnn = neural_networks.neural_net_sklearn(D_in=40, H=20, p=0, arch='lstm', epochs=200)
            dnn.model.load_state_dict(results['model_state_dict'])
            preds = dnn.predict(X1)
            preds_binary = np.logical_and(preds > 0, amp_ok).astype(int)
            accuracy[(k, j, 'lstm', 'accuracy')] = np.mean(y == preds_binary)
            accuracy[(k, j, 'lstm', 'f1')] = metrics.f1_score(y, preds_binary)
            accuracy[(k, j, 'lstm', 'roc.auc')] = metrics.roc_auc_score(y, preds)

            # --- gradient-boosting checkpoint ---
            checkpoint_fname = f'../models/models_different_size_10/downsample_{k}_batch_{j}_gb.pkl'
            with open(checkpoint_fname, 'rb') as f:
                m = pkl.load(f)
            preds = m.predict(X2)
            preds_binary = np.logical_and(preds > 0, amp_ok).astype(int)
            accuracy[(k, j, 'gb', 'accuracy')] = np.mean(y == preds_binary)
            accuracy[(k, j, 'gb', 'f1')] = metrics.f1_score(y, preds_binary)
            accuracy[(k, j, 'gb', 'roc.auc')] = metrics.roc_auc_score(y, preds)

    with open(f'../reports/data_size_stability_10_{outcome_def}.pkl', 'wb') as f:
        pkl.dump(accuracy, f)

    # DASC baseline: predict success whenever the X_d1 feature is positive.
    # (presumably X_d1 is the DASC derivative feature — TODO confirm upstream)
    dasc_pred = (df_test['X_d1'].values > 0).astype(int)
    dasc_acc = np.mean(y == dasc_pred)
    with open('../reports/data_size_stability_10_dasc_acc.pkl', 'wb') as f:
        pkl.dump(dasc_acc, f)
-
|