  1. """Linear model of tree-based decision rules based on the rulefit algorithm from Friedman and Popescu.
  2. The algorithm can be used for predicting an output vector y given an input matrix X. In the first step a tree ensemble
  3. is generated with gradient boosting. The trees are then used to form rules, where the paths to each node in each tree
  4. form one rule. A rule is a binary decision if an observation is in a given node, which is dependent on the input features
  5. that were used in the splits. The ensemble of rules together with the original input features are then being input in a
  6. L1-regularized linear model, also called Lasso, which estimates the effects of each rule on the output target but at the
  7. same time estimating many of those effects to zero.
  8. """

from typing import List, Tuple

import numpy as np
import pandas as pd
import scipy
from scipy.special import softmax
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.base import TransformerMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

from imodels.rule_set.rule_set import RuleSet
from imodels.util.arguments import check_fit_arguments
from imodels.util.extract import extract_rulefit
from imodels.util.rule import get_feature_dict, replace_feature_name, Rule
from imodels.util.score import score_linear
from imodels.util.transforms import Winsorizer, FriedScale


class RuleFit(BaseEstimator, TransformerMixin, RuleSet):
    """RuleFit class. Rather than using this class directly, use RuleFitRegressor or RuleFitClassifier.

    Parameters
    ----------
    n_estimators: Number of boosting iterations used to generate the tree ensemble.
    tree_size: Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
        this will be the mean number of terminal nodes.
    sample_fract: Fraction of randomly chosen training observations used to produce each tree.
        FP 2004 (Sec. 2)
    max_rules: Total number of terms included in the final model (both linear and rules).
        The approximate total number of candidate rules generated for fitting is also based on this.
        Note that the actual number of candidate rules will usually be lower than this due to duplicates.
    memory_par: Scale multiplier (shrinkage factor) applied to each new tree when
        sequentially induced. FP 2004 (Sec. 2)
    lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
        by multiplying the winsorised variable by 0.4/stdev.
    lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear
        terms before standardisation.
    exp_rand_tree_size: If True, each boosted tree will have a different maximum number of
        terminal nodes based on an exponential distribution about tree_size.
        (Friedman Sec 3.3)
    include_linear: Include linear terms as opposed to only rules.
    alpha: Regularization strength; overrides the max_rules parameter.
    cv: If True, use cross-validation scores to select the final regularization value
        out of all values that satisfy max_rules. If False, the least regularization
        possible is used.
    random_state: Integer to initialise random objects and provide repeatability.
    tree_generator: Optional: this object will be used as provided to generate the rules.
        This will override almost all the other properties above.
        Must be GradientBoostingRegressor(), GradientBoostingClassifier(), or RandomForestRegressor().

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble
    feature_names: list of strings, optional (default=None)
        The names of the features (columns)
    """

    def __init__(self,
                 n_estimators=100,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=30,
                 memory_par=0.01,
                 tree_generator=None,
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 include_linear=True,
                 alpha=None,
                 cv=True,
                 random_state=None):
        self.n_estimators = n_estimators
        self.tree_size = tree_size
        self.sample_fract = sample_fract
        self.max_rules = max_rules
        self.memory_par = memory_par
        self.tree_generator = tree_generator
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.exp_rand_tree_size = exp_rand_tree_size
        self.include_linear = include_linear
        self.alpha = alpha
        self.cv = cv
        self.random_state = random_state
        self.winsorizer = Winsorizer(trim_quantile=self.lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        self.stddev = None
        self.mean = None

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble
        """
        X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
        self.n_features_ = X.shape[1]
        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = np.array(list(self.feature_dict_.keys()))
        self.feature_names = np.array(list(self.feature_dict_.values()))
        extracted_rules = self._extract_rules(X, y)
        self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(
            X, y, extracted_rules)
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
        ]

        # count total rule terms, plus nonzero linear terms
        self.complexity_ = self._get_complexity()
        if self.include_linear:
            self.complexity_ += np.sum(
                np.array(self.coef[:X.shape[1]]) != 0)
        return self

    def _predict_continuous_output(self, X):
        """Predict outcome of linear model for X
        """
        if isinstance(X, pd.DataFrame):
            X = X.values.astype(np.float32)

        y_pred = np.zeros(X.shape[0])
        y_pred += self._eval_weighted_rule_sum(X)

        if self.include_linear:
            if self.lin_standardise:
                X = self.friedscale.scale(X)
            y_pred += X @ self.coef[:X.shape[1]]

        return y_pred + self.intercept

    def predict(self, X):
        '''Predict. For regression returns continuous output.
        For classification, returns discrete output.
        '''
        check_is_fitted(self)
        if scipy.sparse.issparse(X):
            X = X.toarray()
        X = check_array(X)
        if isinstance(self, RegressorMixin):
            return self._predict_continuous_output(X)
        else:
            return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        check_is_fitted(self)
        if scipy.sparse.issparse(X):
            X = X.toarray()
        X = check_array(X)
        continuous_output = self._predict_continuous_output(X)
        logits = np.vstack(
            (1 - continuous_output, continuous_output)).transpose()
        return softmax(logits, axis=1)

    def transform(self, X=None, rules=None):
        """Transform dataset into binary rule-activation features.

        Each output column corresponds to one rule and is 1 for the samples
        that satisfy that rule, 0 otherwise (see the worked example after this
        method).

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        df = pd.DataFrame(X, columns=self.feature_placeholders)
        X_transformed = np.zeros((X.shape[0], len(rules)))
        for i, r in enumerate(rules):
            # evaluate each rule as a pandas query over the features it uses
            features_r_uses = [term.split(' ')[0] for term in r.split(' and ')]
            X_transformed[df[features_r_uses].query(r).index.values, i] = 1
        return X_transformed
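
    # Illustrative sketch of transform (example values are invented, not from
    # the source): with feature placeholders ['X_0', 'X_1'] and the single rule
    # 'X_0 <= 3.0 and X_1 > 2.0',
    #
    #     X = [[2.0, 5.0],    ->    X_transformed = [[1.],
    #          [4.0, 5.0]]                           [0.]]
    #
    # Row 0 satisfies both conditions; row 1 fails 'X_0 <= 3.0'.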

    def _get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules (a worked example of the importance
        formulas follows this method).

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
            coefficient not equal to zero.
        subregion: If None (default) returns global importances (FP 2004 eq. 28/29), else returns importance over
            subregion of inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
            the coefficients and 'support' the support of the rule in the training
            data set (X)
        """
        n_features = len(self.coef) - len(self.rules_)
        rule_ensemble = list(self.rules_without_feature_names_)
        output_rules = []

        # Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                subregion = np.array(subregion)
                importance = sum(abs(coef) * abs([x[i] for x in self.winsorizer.trim(subregion)] - self.mean[i])) / len(
                    subregion)
            output_rules += [(self.feature_names[i],
                              'linear', coef, 1, importance)]

        # Add rules
        for i in range(0, len(self.rules_)):
            rule = rule_ensemble[i]
            coef = self.coef[i + n_features]
            if subregion is None:
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support)) ** (1 / 2)
            else:
                rkx = self.transform(subregion, [rule])[:, -1]
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)
            output_rules += [(self.rules_[i].rule, 'rule',
                              coef, rule.support, importance)]
        rules = pd.DataFrame(output_rules, columns=[
            "rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            rules = rules.loc[rules.coef != 0]
        return rules
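
    # Worked example of the global importance formulas above (numbers invented
    # for illustration): a rule with coef 0.8 and support 0.25 gets importance
    # |0.8| * sqrt(0.25 * (1 - 0.25)) ~= 0.35 (FP 2004 eq. 28/29); a linear term
    # with coef 0.5 on a winsorized feature with stddev 2.0 gets |0.5| * 2.0 = 1.0.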

    def visualize(self, decimals=2):
        rules = self._get_rules()
        rules = rules[rules.coef != 0].sort_values("support", ascending=False)
        pd.set_option('display.max_colwidth', None)
        return rules[['rule', 'coef']].round(decimals)

    def __str__(self):
        if not hasattr(self, 'coef'):
            s = self.__class__.__name__
            s += "("
            s += "max_rules="
            s += repr(self.max_rules)
            s += ")"
            return s
        else:
            s = '> ------------------------------\n'
            s += '> RuleFit:\n'
            s += '> \tPredictions are made by summing the coefficients of each rule\n'
            s += '> ------------------------------\n'
            return s + self.visualize().to_string(index=False) + '\n'

    def _extract_rules(self, X, y) -> List[str]:
        return extract_rulefit(X, y,
                               feature_names=self.feature_placeholders,
                               n_estimators=self.n_estimators,
                               tree_size=self.tree_size,
                               memory_par=self.memory_par,
                               tree_generator=self.tree_generator,
                               exp_rand_tree_size=self.exp_rand_tree_size,
                               random_state=self.random_state)

    def _score_rules(self, X, y, rules) -> Tuple[List[Rule], List[float], float]:
        X_concat = np.zeros([X.shape[0], 0])

        # standardise linear variables if requested (for regression model only)
        if self.include_linear:
            # standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)
            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()
            X_concat = np.concatenate((X_concat, X_regn), axis=1)

        X_rules = self.transform(X, rules)
        if X_rules.shape[0] > 0:
            X_concat = np.concatenate((X_concat, X_rules), axis=1)

        # no rules fit and self.include_linear == False
        if X_concat.shape[1] == 0:
            return [], [], 0

        prediction_task = 'regression' if isinstance(
            self, RegressorMixin) else 'classification'
        return score_linear(X_concat, y, rules,
                            prediction_task=prediction_task,
                            max_rules=self.max_rules,
                            alpha=self.alpha,
                            cv=self.cv,
                            random_state=self.random_state)


class RuleFitRegressor(RuleFit, RegressorMixin):
    ...


class RuleFitClassifier(RuleFit, ClassifierMixin):
    ...
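

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the library). Assumes the
# imodels package and its dependencies are installed; the synthetic data and
# parameter values are arbitrary choices for demonstration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X_demo = rng.rand(200, 4)
    # target depends on a threshold interaction, which RuleFit can capture as a rule
    y_demo = (2 * (X_demo[:, 0] > 0.5) * (X_demo[:, 1] > 0.5)
              + X_demo[:, 2] + 0.1 * rng.randn(200))

    model = RuleFitRegressor(max_rules=10, random_state=0)
    model.fit(X_demo, y_demo, feature_names=['a', 'b', 'c', 'd'])
    preds = model.predict(X_demo)
    print('train MSE:', np.mean((preds - y_demo) ** 2))
    print(model)  # prints the selected rules and their coefficients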