# fastai 0.7-era star imports: these pull in np, pd, warnings, is_numeric_dtype,
# and the structured-data helpers used below (fix_missing, numericalize,
# get_sample, add_datepart)
from fastai.imports import *
from fastai.structured import *
from fastai.column_data import *
import pickle
import sklearn.exceptions
import joblib  # sklearn.externals.joblib was removed from scikit-learn; use the standalone package
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn_pandas import DataFrameMapper
from pandas_summary import DataFrameSummary
from tqdm import tqdm_notebook
class StandardScalerJustin(TransformerMixin, BaseEstimator):
    """Standard scaler that ignores NaNs when fitting and leaves
    zero-variance columns unchanged instead of dividing by zero."""

    def __init__(self, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy

    def fit(self, X, y=None):
        # DataFrameMapper hands over a 2-D ndarray; flatten it to a Series
        # so NaNs can be dropped before computing the statistics
        if isinstance(X, np.ndarray):
            X = pd.Series(X.reshape(-1))
        self.mean_ = X.dropna().mean()
        self.var_ = X.dropna().var()
        return self

    def transform(self, X):
        mean = self.mean_
        std_dev = np.sqrt(self.var_)
        if std_dev == 0:  # constant column; scaling would divide by zero
            return X
        return (X - mean) / std_dev
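
# A minimal usage sketch (illustrative only; the Series below is made up).
# The scaler fits on non-NaN values and propagates NaNs through transform,
# which is why proc_df_justin runs fix_missing before applying the mapper.
# s = pd.Series([1.0, 2.0, np.nan, 4.0])
# scaler = StandardScalerJustin().fit(s)  # mean_/var_ computed on non-NaN rows
# scaler.transform(s)                     # NaN rows stay NaN
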
def fit_scalers(df, mapper):
    """Fit a DataFrameMapper of StandardScalerJustin instances over every
    numeric column, unless an already-fitted mapper is supplied."""
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScalerJustin()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    return mapper
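
# Illustrative sketch (train_df is a made-up name): fit once on training data,
# then pass the fitted mapper back in for valid/test frames so they are scaled
# with the training-time means and standard deviations.
# mapper = fit_scalers(train_df, None)
# scaled = mapper.transform(train_df)  # ndarray of the scaled numeric columns
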
def proc_df_justin(df, y_fld, valid_test, skip_flds=None, do_scale=False, na_dict=None,
                   preproc_fn=None, max_n_cat=None, subset=None, mapper=None,
                   train_cols_meds=None, cols=None):
    """proc_df_justin takes a data frame df, splits off the response variable,
    and changes the df into an entirely numeric data frame.

    Parameters:
    -----------
    df: The data frame to process.
    y_fld: The name of the response variable.
    valid_test: Boolean indicating whether this df must be matched to the
        training columns.
    skip_flds: A list of fields to drop from df.
    do_scale: Boolean; if True, standardizes each numeric column in df.
    na_dict: A dictionary of NA columns to add. NA columns are also added if
        there are any missing values.
    preproc_fn: A function applied to df before processing.
    max_n_cat: The maximum number of categories to break into dummy values
        instead of integer codes.
    subset: Takes a random subset of size subset from df.
    mapper: If do_scale is True, the mapper holds the values used for scaling
        (mean and standard deviation) calculated at training time.
    train_cols_meds: Dict whose keys are training columns and whose values are
        their medians, used to fill an entirely missing column. This shouldn't
        be needed when actually picking loans; it was needed for
        train/valid/test because new fields were added over the timeframe and
        so are missing from some datasets while existing in others.
    cols: Training column order, used to ensure the variables end up in the
        right order.

    Returns:
    --------
    [x, y, nas, embeddings, (medians, columns), (mapper)]:
        x: The transformed version of df; entirely numeric, without the
            response variable.
        y: The response variable.
        nas: A dictionary of the NA columns created and the associated medians.
        embeddings: Embedding sizes for the categorical variables.
        medians, columns: Returned only for training data (valid_test=False),
            for later use as train_cols_meds and cols.
        mapper: Returned only when do_scale is True; a DataFrameMapper that
            stores the mean and standard deviation of the corresponding
            continuous variables, used for scaling at test time."""
    assert isinstance(valid_test, bool), 'must indicate if this is a test/valid set to match columns with train'

    if not skip_flds: skip_flds = []
    if subset: df = get_sample(df, subset)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds + [y_fld], axis=1, inplace=True)
    # fit the scalers on the raw columns (NaNs are ignored during the fit)
    if do_scale: mapper = fit_scalers(df, mapper)
    if na_dict is None: na_dict = {}
    for n, c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # only apply the mapper when scaling was requested; it is None otherwise
    if do_scale: df[mapper.transformed_names_] = mapper.transform(df)
    embeddings = []
    for n, c in df.items():
        numericalize(df, c, n, max_n_cat)
        if not is_numeric_dtype(c):
            embeddings.append(prep_embeddings(c, n))
    df = pd.get_dummies(df, dummy_na=True)
    # fill the NAs: columns present in train but absent here get the train median
    if valid_test:
        for col, med in train_cols_meds.items():
            try:
                df[col].fillna(med, inplace=True)
            except KeyError:
                print(col)
                df[col] = med
        df = df[cols]  # match the training column order

    res = [df, y, na_dict, embeddings]
    if not valid_test: res += [res[0].median(), res[0].columns]
    if do_scale: res = res + [mapper]
    return res
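
# Illustrative usage sketch (variable and column names are made up): process
# the training frame first, then feed its outputs back in so the valid/test
# frames end up with the same columns, fills, and scaling.
# x_trn, y_trn, nas, embs, meds, trn_cols, mapper = proc_df_justin(
#     train_df, 'target', valid_test=False, do_scale=True)
# x_val, y_val, _, _, _ = proc_df_justin(
#     valid_df, 'target', valid_test=True, do_scale=True, na_dict=nas,
#     mapper=mapper, train_cols_meds=meds.to_dict(), cols=trn_cols)
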
def prep_embeddings(c, n):
    # add one to the category count to allocate an embedding slot for nulls
    return (n, len(c.cat.categories) + 1)
def eval_models(trials, port_size, available_loans, regr_version, X_test, y_test,
                default_series, yhat_test):
    """Monte Carlo simulation: repeatedly draw a batch of available loans,
    pick the top port_size by predicted return, and record the realized
    return and default rate of each simulated portfolio."""
    results = {}
    pct_default = {}
    test_copy = X_test.copy()

    default_series = default_series.loc[X_test.index]
    yhats_ys_defs = pd.DataFrame([yhat_test, y_test, default_series.values]).T
    yhats_ys_defs.rename(columns={0: 'yhat', 1: 'y', 2: 'defaults'}, inplace=True)
    for trial in tqdm_notebook(np.arange(trials)):
        # of all test loans, grab a batch of n=available_loans
        available_idx = np.random.choice(
            np.arange(len(test_copy)), available_loans, replace=False)
        # .iloc replaces the long-deprecated .ix for positional indexing
        available_loans_df = yhats_ys_defs.iloc[available_idx, :]
        available_loans_df = available_loans_df.sort_values('yhat', ascending=False)
        picks = available_loans_df[:port_size]
        results[trial] = picks['y'].mean()
        pct_default[trial] = picks['defaults'].sum() / port_size
    pct_default_series = pd.Series(pct_default)
    results_df = pd.DataFrame(pd.Series(results))
    results_df['pct_def'] = pct_default_series
    # codes replaces the deprecated labels argument to pd.MultiIndex
    results_df.columns = pd.MultiIndex(levels=[[regr_version], [0.07, 'pct_def']],
                                       codes=[[0, 0], [0, 1]],
                                       names=['discount_rate', 'model'])
    return results_df
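
# Illustrative sketch (argument values are made up): simulate 1,000 portfolios
# of 40 loans picked from 400 available at a time, using a model's test-set
# predictions and a Series flagging which loans actually defaulted.
# sim = eval_models(trials=1000, port_size=40, available_loans=400,
#                   regr_version='0.2.2', X_test=X_test, y_test=y_test,
#                   default_series=defaults, yhat_test=regr.predict(X_test))
# sim.mean()  # average realized return and default rate across trials
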
def load_RF():
    return joblib.load(f'{PATH_RF}{regr_version_RF}_{training_type}.pkl')

def load_NN():
    # restore the training-time proc_df outputs plus the frames and metadata
    # needed to rebuild the fastai columnar learner
    with open(f'{data_save_path}/for_proc_df_model_loading.pkl', 'rb') as handle:
        (nas_all_train, embeddings_all_train, train_cols_meds_all_train,
         cols_all_train, mean_stdev_mapper_all_train, dl_df_train, dl_ys_train,
         cat_vars, emb_szs) = pickle.load(handle)
    val_idxs = [0]
    bs = 64
    X_test = None
    regr_version_NN = '1.0.1'
    training_type = 'all'
    md = ColumnarModelData.from_data_frame(PATH_NN, val_idxs, dl_df_train, dl_ys_train,
                                           cat_vars, bs, test_df=X_test)
    n_cont = len(dl_df_train.columns) - len(cat_vars)
    nn = md.get_learner(emb_szs, n_cont, 0.05, 1, [1000, 500, 500, 250, 250],
                        [0.2, 0.2, .2, .15, .05])
    nn.load(f'{PATH_NN}{regr_version_NN}_{training_type}.pth')
    return nn  # hand back the loaded learner
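
# Illustrative sketch: with the paths and versions configured below, the two
# loaders return ready-to-predict models (assuming the saved .pkl/.pth
# artifacts exist on disk).
# rf = load_RF()
# nn = load_NN()
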
def add_dateparts(df):
    """Use fastai's add_datepart to expand each datetime column into numeric
    date parts; columns listed in special_cols (e.g. issue_d) are skipped."""
    date_cols = df.select_dtypes(['datetime64']).columns
    for date_col in date_cols:
        if date_col not in special_cols:
            add_datepart(df, date_col, drop=True)
    return [col for col in date_cols if col not in special_cols]
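
# Illustrative sketch (made-up frame): each datetime64 column not listed in
# special_cols is replaced by numeric parts such as Year, Month, and Elapsed.
# df = pd.DataFrame({'listed_d': pd.to_datetime(['2015-01-01', '2016-06-01'])})
# expanded = add_dateparts(df)  # returns the datetime columns it expanded
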
# for saving
special_cols = []
platform = 'lendingclub'
datapath = '/home/justin/all_data/'
PATH_NN = f'{datapath}{platform}/NN/'
PATH_RF = f'{datapath}{platform}/RF/'
data_save_path = f'{datapath}{platform}/'
training_type = 'all'
regr_version_RF = '0.2.2'
regr_version_NN = '1.0.1'