import sys
import os
import errno
import numpy as np
import time
import pathlib as pl
import pandas as pd
import yaml
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import SVC
import joblib
# Create a directory tree, ignoring "already exists" errors (like `mkdir -p`)
def mkdir_p(path):
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
def cross_validate_model(X, y, model, model_grid, scorer, ksplits=10, seed=42, **kwargs):
    kfold = KFold(n_splits=ksplits, random_state=seed, shuffle=True)
    grid = GridSearchCV(estimator=model, param_grid=model_grid, scoring=scorer,
                        cv=kfold, n_jobs=-1, refit='accuracy')
    grid_result = grid.fit(X, y)
    print("Best %s: %f using %s" % (grid_result.refit, grid_result.best_score_, grid_result.best_params_))
    # Collect, for each metric, the scores achieved by the top performer under the
    # refit (default/decisive) scorer; best_index_ points at that candidate, whereas
    # index 0 is merely the first grid entry.
    best_idx = grid_result.best_index_
    rows = []
    for mtr in scorer:
        print(mtr.upper())
        means = grid_result.cv_results_['mean_test_' + mtr]
        stds = grid_result.cv_results_['std_test_' + mtr]
        params = grid_result.cv_results_['params']
        ranks = grid_result.cv_results_['rank_test_' + mtr]
        for mean, stdev, param, rank in zip(means, stds, params, ranks):
            print("#%d %f (%f) with: %r" % (rank, mean, stdev, param))
        rows.append([mtr, means[best_idx], stds[best_idx], params[best_idx], ranks[best_idx]])
    # DataFrame.append was removed in pandas 2.0; build the frame in one pass instead
    df_scores = pd.DataFrame(rows, columns=['metric', 'mean_test', 'std_test', 'params', 'rank_test'])
    return grid_result, df_scores
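# A minimal standalone sketch of cross_validate_model on a toy binary dataset
# (the breast-cancer loader and the small grid here are illustrative, not part
# of this pipeline):
#
#   from sklearn.datasets import load_breast_cancer
#   X, y = load_breast_cancer(return_X_y=True)
#   result, df = cross_validate_model(X, y, model=SVC(), model_grid={'C': [0.1, 1.0]},
#                                     scorer=['accuracy', 'recall', 'precision'])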
# TODO
def tune_models(models, grids):
    # Relies on X_train, y_train, evaluation_metric and the output paths defined under __main__
    for model, grid in zip(models, grids):
        start_cv = time.time()
        grid_result, scores = cross_validate_model(X=X_train, y=y_train, model=model, model_grid=grid,
                                                   scorer=evaluation_metric)
        end_cv = time.time()
        joblib.dump(grid_result.best_estimator_, os.path.join(models_folder, type(model).__name__ + '_model.joblib'))
        # Generate dataframe to be appended to selection_metrics.csv; named metrics_df so it
        # does not shadow the sklearn.metrics import. The Timestamp column is wall-clock
        # time in milliseconds.
        metrics_df = pd.DataFrame([[scores.metric[i], scores.mean_test[i], int(end_cv * 1000), scores.rank_test[i]]
                                   for i in range(len(scores))],
                                  columns=['Name', 'Value', 'Timestamp', 'Step'])
        if not os.path.exists(os.path.join(metrics_folder, 'selection_metrics.csv')):
            metrics_df.to_csv(os.path.join(metrics_folder, 'selection_metrics.csv'), index=False)
        else:
            metrics_df.to_csv(os.path.join(metrics_folder, 'selection_metrics.csv'), index=False, header=False, mode='a')
        # Save tuned model parameters
        with open(params_path, 'a') as yaml_path:
            yaml.safe_dump({'model': type(model).__name__}, yaml_path)
            yaml.safe_dump(grid_result.best_params_, yaml_path)
    return
# TODO
def select_model():
    return 'knn_model'
if __name__ == "__main__":
    input_path = sys.argv[1]  # renamed from `input`, which shadows the builtin
    # TODO: change output to folder only (also on stage)
    output = pl.Path(sys.argv[2])
    metrics_folder = pl.Path('metrics')
    params_path = os.path.join(output, 'selection_params.yaml')
    models_folder = os.path.join(output, 'models')
    if not os.path.exists(models_folder):
        mkdir_p(models_folder)
    if not os.path.exists(metrics_folder):
        mkdir_p(metrics_folder)
    evaluation_metric = ['accuracy', 'recall', 'precision']
    train = joblib.load(input_path)
    y_train = train['label'].copy(deep=True)
    X_train = train.drop('label', axis=1).astype(float)
    # TODO: iteratively tune_models() with fixed grids for gridsearch
    # (or random search with hyperopt, see evernote)
    ######################################## models and grids definition ##############################################
    knn = KNeighborsClassifier()
    neighbors = [1, 3, 5, 7, 9, 11, 13]
    knn_grid = dict(n_neighbors=neighbors)
    c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
    kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']
    svc_grid = dict(C=c_values, kernel=kernel_values)
    svc = SVC()
    clfs = [svc]
    clsf_grids = [svc_grid]
    tune_models(models=clfs, grids=clsf_grids)
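# Usage sketch (the script name and paths are illustrative): the first argument is
# expected to be a pandas DataFrame with a 'label' column serialized with joblib,
# and the second is the output folder for models and tuned parameters:
#
#   python model_selection.py train.joblib out/
#
# Artifacts produced: out/models/SVC_model.joblib, out/selection_params.yaml and
# metrics/selection_metrics.csv.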