investing_utils2.py
from fastai.imports import *     # fastai 0.7-era imports (bring in np, pd, etc.)
from fastai.structured import *  # fix_missing, numericalize, add_datepart, get_sample
from fastai.column_data import *
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn_pandas import DataFrameMapper  # used by fit_scalers below
from pandas_summary import DataFrameSummary
import sklearn.exceptions
import warnings
import pickle
import joblib  # sklearn.externals.joblib was removed from scikit-learn; import directly
from tqdm import tqdm_notebook  # used by eval_models below
class StandardScalerJustin(TransformerMixin, BaseEstimator):
    """StandardScaler variant that ignores NaNs when computing mean/variance."""
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy

    def fit(self, X, y=None):
        if isinstance(X, np.ndarray):
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:
            return X  # constant column: leave unscaled to avoid division by zero
        return (X - mean) / std_dev
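# --- Usage sketch (illustrative; not part of the original module) ---
# Shows that the scaler fits its statistics on non-null values only, so NaNs
# pass through transform untouched while the rest of the column is z-scored.
def _demo_standard_scaler():
    s = pd.Series([1.0, 2.0, np.nan, 4.0])
    scaler = StandardScalerJustin().fit(s)
    return scaler.transform(s)  # scaled values; the NaN stays NaN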
def fit_scalers(df, mapper):
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScalerJustin()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper
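# --- Usage sketch (illustrative; data is made up) ---
# Fit scalers on the training frame once (mapper=None), then reuse the fitted
# mapper on valid/test so the same means and variances are applied everywhere.
def _demo_fit_scalers():
    train_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': ['x', 'y', 'z']})
    mapper = fit_scalers(train_df, None)    # fits only the numeric columns
    valid_df = pd.DataFrame({'a': [4.0, 5.0, 6.0], 'b': ['x', 'x', 'y']})
    return mapper.transform(valid_df)       # reuses the train statistics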
def proc_df_justin(df, y_fld, valid_test, skip_flds=None, do_scale=False, na_dict=None,
                   preproc_fn=None, max_n_cat=None, subset=None, mapper=None,
                   train_cols_meds=None, cols=None):
    """proc_df takes a data frame df, splits off the response variable, and
    changes df into an entirely numeric data frame.

    Parameters:
    -----------
    df: The data frame you wish to process.
    y_fld: The name of the response variable.
    valid_test: Boolean indicating whether this df must be matched to the training columns.
    skip_flds: A list of fields to drop from df.
    do_scale: If True, standardizes each numeric column in df.
    na_dict: A dictionary of NA columns to add. NA columns are also added if there
        are any missing values.
    preproc_fn: A function applied to df before processing.
    max_n_cat: The maximum number of categories to break into dummy values instead
        of integer codes.
    subset: Takes a random subset of size subset from df.
    mapper: If do_scale is True, the mapper holds the values computed during
        training (mean and standard deviation) that are used for scaling.
    train_cols_meds: Dict mapping training columns to their medians; used to fill
        an entirely missing column. (Shouldn't be needed when actually picking
        loans; it was needed for train/valid/test because new fields were added
        over the timeframe, so they are missing in some datasets while present
        in others.)
    cols: Training column order, used to ensure the variables line up.

    Returns:
    --------
    [x, y, nas, embeddings, (medians, columns), (mapper)]:
    x: The transformed version of df: entirely numeric, without the response variable.
    y: The response variable.
    nas: A dictionary of the NA columns created, with the associated medians.
    embeddings: Embedding sizes for the categorical variables.
    mapper: A DataFrameMapper storing the mean and standard deviation of the
        corresponding continuous variables, used for scaling at test time."""
    assert isinstance(valid_test, bool), \
        'must indicate if this is a test/valid set to match columns with train'
    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds + [y_fld], axis=1, inplace=True)
    # fit the scalers
    if do_scale: mapper = fit_scalers(df, mapper)
    if na_dict is None: na_dict = {}
    for n, c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # apply the (train-fitted) scalers; guarded on do_scale so mapper can't be None here
    if do_scale: df[mapper.transformed_names_] = mapper.transform(df)
    embeddings = []
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            embeddings.append(prep_embeddings(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # fix the NAs: fill columns missing in valid/test with the training medians
    if valid_test:
        for col, med in train_cols_meds.items():
            try:
                df[col].fillna(med, inplace=True)
            except KeyError:
                print(col)
                df[col] = med
        df = df[cols]  # enforce the training column order
    res = [df, y, na_dict, embeddings]
    if not valid_test: res += [res[0].median(), res[0].columns]
    if do_scale: res = res + [mapper]
    return res
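# --- Usage sketch (illustrative; assumes a fastai 0.7-era environment and
# made-up data). A training pass first, then a valid/test pass that reuses
# everything the training pass learned: NA medians, the scaler mapper, the
# per-column medians, and the column order.
def _demo_proc_df_justin():
    train = pd.DataFrame({
        'loan_amnt': [1000.0, 2000.0, np.nan, 4000.0],
        'grade': pd.Categorical(['A', 'B', 'A', 'C']),
        'ret': [0.05, 0.02, 0.07, 0.01],
    })
    x, y, nas, embs, meds, cols, mapper = proc_df_justin(
        train, 'ret', valid_test=False, do_scale=True)
    valid = train.copy()
    x_val, y_val, _, _, mapper = proc_df_justin(
        valid, 'ret', valid_test=True, do_scale=True, na_dict=nas,
        mapper=mapper, train_cols_meds=meds.to_dict(), cols=cols)
    return x, x_val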
def prep_embeddings(c, n):
    # allocate one extra embedding slot for nulls
    return (n, len(c.cat.categories) + 1)
def eval_models(trials, port_size, available_loans, regr_version, X_test, y_test,
                default_series, yhat_test):  # regr,
    results = {}
    pct_default = {}
    test_copy = X_test.copy()
    default_series = default_series.loc[X_test.index]
    yhats_ys_defs = pd.DataFrame([yhat_test, y_test, default_series.values]).T
    yhats_ys_defs.rename(columns={0: 'yhat', 1: 'y', 2: 'defaults'}, inplace=True)
    for trial in tqdm_notebook(np.arange(trials)):
        # of all test loans, grab a batch of n=available_loans
        available_idx = np.random.choice(
            np.arange(len(test_copy)), available_loans, replace=False)
        # .ix is long deprecated; positional selection is what's wanted here
        available_loans_df = yhats_ys_defs.iloc[available_idx, :]
        available_loans_df = available_loans_df.sort_values('yhat', ascending=False)
        picks = available_loans_df[:port_size]
        results[trial] = picks['y'].mean()
        pct_default[trial] = picks['defaults'].sum() / port_size
    pct_default_series = pd.Series(pct_default)
    results_df = pd.DataFrame(pd.Series(results))
    results_df['pct_def'] = pct_default_series
    # pandas renamed the MultiIndex `labels` argument to `codes`
    results_df.columns = pd.MultiIndex(levels=[[regr_version], [0.07, 'pct_def']],
                                       codes=[[0, 0], [0, 1]],
                                       names=['discount_rate', 'model'])
    return results_df
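# --- Usage sketch (illustrative; synthetic data) ---
# Simulates the portfolio picker: each trial samples `available_loans` loans,
# keeps the `port_size` with the highest predicted return, and records the
# realized mean return and default rate of the picks.
def _demo_eval_models():
    n = 200
    rng = np.random.RandomState(0)
    X_test = pd.DataFrame({'feature': rng.randn(n)})
    y_test = rng.randn(n) * 0.05
    yhat_test = y_test + rng.randn(n) * 0.02  # noisy predictions
    default_series = pd.Series((rng.rand(n) < 0.1).astype(float),
                               index=X_test.index)
    return eval_models(trials=50, port_size=20, available_loans=100,
                       regr_version='demo', X_test=X_test, y_test=y_test,
                       default_series=default_series, yhat_test=yhat_test)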
def load_RF():
    # relies on the module-level PATH_RF, regr_version_RF and training_type below
    return joblib.load(f'{PATH_RF}{regr_version_RF}_{training_type}.pkl')
def load_NN():
    with open(f'{data_save_path}/for_proc_df_model_loading.pkl', 'rb') as handle:
        (nas_all_train, embeddings_all_train, train_cols_meds_all_train,
         cols_all_train, mean_stdev_mapper_all_train, dl_df_train, dl_ys_train,
         cat_vars, emb_szs) = pickle.load(handle)
    val_idxs = [0]
    bs = 64
    X_test = None
    regr_version_NN = '1.0.1'
    training_type = 'all'
    md = ColumnarModelData.from_data_frame(PATH_NN, val_idxs, dl_df_train, dl_ys_train,
                                           cat_vars, bs, test_df=X_test)
    n_cont = len(dl_df_train.columns) - len(cat_vars)
    nn = md.get_learner(emb_szs, n_cont, 0.05, 1, [1000, 500, 500, 250, 250],
                        [0.2, 0.2, .2, .15, .05])
    nn.load(f'{PATH_NN}{regr_version_NN}_{training_type}.pth')
    return nn  # the original ended without a return; the learner is presumably what's wanted
def add_dateparts(df):
    '''Uses the fastai add_datepart to turn datetime columns into numeric parts.
    Skips any columns listed in special_cols (e.g. issue_d).'''
    date_cols = df.select_dtypes(['datetime64']).columns
    for date_col in date_cols:
        if date_col not in special_cols:
            add_datepart(df, date_col, drop=True)
    return [col for col in date_cols if col not in special_cols]
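# --- Usage sketch (illustrative; 'list_d' is a hypothetical column name) ---
# Expands a datetime column into fastai's date-part features (year, month,
# day of week, elapsed, ...) in place and returns the columns it expanded.
def _demo_add_dateparts():
    df = pd.DataFrame({'list_d': pd.to_datetime(['2019-01-01', '2019-02-15'])})
    expanded = add_dateparts(df)
    return df, expanded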
# for saving
special_cols = []
platform = 'lendingclub'
datapath = '/home/justin/all_data/'
PATH_NN = f'{datapath}{platform}/NN/'
PATH_RF = f'{datapath}{platform}/RF/'
data_save_path = f'{datapath}{platform}/'
training_type = 'all'
regr_version_RF = '0.2.2'
regr_version_NN = '1.0.1'
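# --- Smoke test (illustrative; exercises the demo sketches added above) ---
# The proc_df and date-part demos assume a fastai 0.7-era environment, and the
# model-loading helpers need the pickled artifacts on disk, so only the
# self-contained demos are run here.
if __name__ == '__main__':
    print(_demo_standard_scaler())
    print(_demo_eval_models().describe())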