Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

testing_hyperopt_script.py 5.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
  1. import os
  2. import sys
  3. import argparse
  4. import pickle
  5. import joblib
  6. import numpy as np
  7. import pandas as pd
  8. from sklearn.linear_model import LogisticRegression
  9. from catboost import CatBoostRegressor, CatBoostClassifier
  10. from lendingclub import config, utils
  11. import j_utils.munging as mg
  12. from torch import tensor, nn
  13. from hyperopt import fmin, tpe, hp, STATUS_OK, STATUS_FAIL, Trials
  14. def cross_entropy(X,y):
  15. """
  16. X is the output from fully connected layer (num_examples x num_classes)
  17. y is labels (num_examples x 1)
  18. Note that y is not one-hot encoded vector.
  19. It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
  20. """
  21. m = y.shape[0]
  22. p = X
  23. # We use multidimensional array indexing to extract
  24. # softmax probability of the correct label for each sample.
  25. # Refer to https://docs.scipy.org/doc/numpy/user/basics.indexing.html#indexing-multi-dimensional-arrays for understanding multidimensional array indexing.
  26. log_likelihood = -np.log(p[range(m),y])
  27. loss = np.sum(log_likelihood) / m
  28. return loss
  29. def prepare_data(model_n, data, proc=None, ds_type='train'):
  30. '''
  31. returns the processed data for a model, which could be different between
  32. model types e.g. can handle categoricals or not. additionally returns
  33. a tuple of anything necessary to process valid/test data in the same manner
  34. ds_type must be 'train', 'valid', or 'test'
  35. '''
  36. assert ds_type in ['train', 'valid', 'test'], print('ds_type invalid')
  37. if model_n in ['baseline', 'A', 'B', 'C', 'D', 'E', 'F', 'G']:
  38. return data, None
  39. # elif model_n == 'logistic_regr':
  40. else:
  41. if ds_type == 'train':
  42. temp = mg.train_proc(data)
  43. procced = temp[0]
  44. return procced, temp[1:]
  45. elif ds_type in ['test', 'valid']:
  46. assert proc, print('must pass data processing artifacts')
  47. temp = mg.val_test_proc(data, *proc)
  48. return temp
  49. class HPOpt(object):
  50. def __init__(self, x_train, x_test, y_train, y_test):
  51. self.x_train = x_train
  52. self.x_test = x_test
  53. self.y_train = y_train
  54. self.y_test = y_test
  55. def process(self, fn_name, space, trials, algo, max_evals):
  56. fn = getattr(self, fn_name)
  57. # try:
  58. result = fmin(fn=fn, space=space, algo=algo, max_evals=max_evals, trials=trials)
  59. # except Exception as e:
  60. # return {'status': STATUS_FAIL,
  61. # 'exception': str(e)}
  62. return result, trials
  63. # def xgb_reg(self, para):
  64. # reg = xgb.XGBRegressor(**para['reg_params'])
  65. # return self.train_reg(reg, para)
  66. # def lgb_reg(self, para):
  67. # reg = lgb.LGBMRegressor(**para['reg_params'])
  68. # return self.train_reg(reg, para)
  69. def ctb_regr(self, para):
  70. reg = CatBoostRegressor(**para['regr_params'])
  71. return self.train_reg(reg, para)
  72. def ctb_clf(self, para):
  73. clf = CatBoostClassifier(**para['clf_params'])
  74. return self.train_clf(clf, para)
  75. def train_reg(self, reg, para):
  76. reg.fit(self.x_train, self.y_train,
  77. eval_set=(self.x_test, self.y_test),
  78. **para['fit_params'])
  79. pred = reg.predict(self.x_test)
  80. loss = para['loss_func'](self.y_test, pred)
  81. return {'loss': loss, 'status': STATUS_OK}
  82. def train_clf(self, clf, para):
  83. clf.fit(self.x_train, self.y_train,
  84. eval_set=(self.x_test, self.y_test),
  85. **para['fit_params'])
  86. pred = clf.predict_proba(self.x_test)
  87. loss = para['loss_func'](pred, self.y_test.values)
  88. return {'loss': loss, 'status': STATUS_OK}
  89. model_n = 'catboost_clf'
  90. loss = cross_entropy
  91. # CatBoost parameters
  92. ctb_clf_params = {
  93. 'learning_rate': hp.choice('learning_rate', np.geomspace(.005, .5, num=5)),
  94. # 'max_depth': hp.choice('max_depth', np.arange(1,16, 1)),
  95. # 'colsample_bylevel': hp.choice('colsample_bylevel',
  96. # np.arange(0.1, 1.0, 0.1)),
  97. 'n_estimators': 10,
  98. 'eval_metric': hp.choice('eval_metric', ['Logloss', 'CrossEntropy']),
  99. 'task_type':'GPU',
  100. }
  101. ctb_fit_params = {'early_stopping_rounds': 5, 'verbose': False}
  102. ctb_para = dict()
  103. ctb_para['clf_params'] = ctb_clf_params
  104. ctb_para['fit_params'] = ctb_fit_params
  105. ctb_para['loss_func'] = loss
  106. loss(np.array([[1,0], [0,1]]), np.array([0,1]))
  107. tr_val_base_data, tr_val_eval_data, _ = utils.load_dataset(ds_type='train')
  108. tscv = mg.time_series_data_split(tr_val_eval_data, 'issue_d', 20, 1)
  109. for tr_idx, val_idx in tscv:
  110. # split out validation from train_data
  111. if model_n in ['logistic_regr', 'catboost_clf']:
  112. y_train = tr_val_eval_data.loc[tr_idx, 'target_loose']
  113. y_valid = tr_val_eval_data.loc[val_idx, 'target_loose']
  114. else:
  115. y_train = tr_val_eval_data.loc[tr_idx, '0.07']
  116. y_valid = tr_val_eval_data.loc[val_idx, '0.07']
  117. X_train = tr_val_base_data.loc[tr_idx]
  118. X_valid = tr_val_base_data.loc[val_idx]
  119. X_train, proc_arti = prepare_data(model_n, X_train, ds_type='train')
  120. X_valid = prepare_data(model_n, X_valid, proc = proc_arti, ds_type='valid')
  121. X_train.shape
  122. obj = HPOpt(X_train, X_valid, y_train, y_valid)
  123. ctb_opt = obj.process(fn_name='ctb_clf', space=ctb_para, trials=Trials(), algo=tpe.suggest, max_evals=10)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...