1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
|
- import numpy as np
- class LocalDecisionStump:
- """
- An object that implements a callable local decision stump function and
- that also includes some meta-data that allows for an API to interact with
- other methods in the package.
- A local decision stump is a tri-valued function that is zero outside of a
- rectangular region, and on that region, takes either a positive or negative
- value, depending on whether a single designated feature is above or below a
- threshold. For more information on what a local decision stump is, refer to
- our paper.
- Parameters
- ----------
- feature: int
- Feature used in the decision stump
- threshold: float
- Threshold used in the decision stump
- left_val: float
- The value taken when x_k <= threshold
- right_val: float
- The value taken when x_k > threshold
- a_features: list of ints
- List of ancestor feature indices (ordered from highest ancestor to
- lowest)
- a_thresholds: list of floats
- List of ancestor thresholds (ordered from highest ancestor to lowest)
- a_signs: list of bools
- List of signs indicating whether the current node is in the left child
- (False) or right child (True) of the ancestor nodes (ordered from
- highest ancestor to lowest)
- """
- def __init__(self, feature, threshold, left_val, right_val, a_features,
- a_thresholds, a_signs):
- self.feature = feature
- self.threshold = threshold
- self.left_val = left_val
- self.right_val = right_val
- self.a_features = a_features
- self.a_thresholds = a_thresholds
- self.a_signs = a_signs
- def __call__(self, data):
- """
- Return values of the local decision stump function on an input data
- matrix with samples as rows
- Parameters
- ----------
- data: array-like of shape (n_samples, n_features)
- Data matrix to feed into the local decision stump function
- Returns
- -------
- values: array-like of shape (n_samples,)
- Function values on the data
- """
- root_to_stump_path_indicators = \
- _compare_all(data, self.a_features, np.array(self.a_thresholds),
- np.array(self.a_signs))
- in_node = np.all(root_to_stump_path_indicators, axis=1).astype(int)
- is_right = _compare(data, self.feature, self.threshold).astype(int)
- values = in_node * (is_right * self.right_val +
- (1 - is_right) * self.left_val)
- return values
- def __repr__(self):
- return f"LocalDecisionStump(feature={self.feature}, " \
- f"threshold={self.threshold}, left_val={self.left_val}, " \
- f"right_val={self.right_val}, a_features={self.a_features}, " \
- f"a_thresholds={self.a_thresholds}, " \
- f"a_signs={self.a_signs})"
- def get_depth(self):
- """
- Get depth of the local decision stump, i.e. count how many ancenstor
- nodes it has. The root node has depth 0.
- """
- return len(self.a_features)
- def make_stump(node_no, tree_struct, parent_stump, is_right_child,
- normalize=False):
- """
- Create a single local decision stump corresponding to a node in a
- scikit-learn tree structure object. The nonzero values of the stump are
- chosen so that the vector of local decision stump values over the training
- set (used to fit the tree) is orthogonal to those of all ancestor nodes.
- Parameters
- ----------
- node_no: int
- The index of the node
- tree_struct: object
- The scikit-learn tree object
- parent_stump: LocalDecisionStump object
- The local decision stump corresponding to the parent of the node in q
- uestion
- is_right_child: bool
- True if the new node is the right child of the parent node, False
- otherwise
- normalize: bool
- Flag. If set to True, then divide the nonzero function values by
- sqrt(n_samples in node) so that the vector of function values on the
- training set has unit norm. If False, then do not divide, so that the
- vector of function values on the training set has norm equal to
- n_samples in node.
- Returns
- -------
- """
- # Get features, thresholds and signs for ancestors
- if parent_stump is None: # If root node
- a_features = []
- a_thresholds = []
- a_signs = []
- else:
- a_features = parent_stump.a_features + [parent_stump.feature]
- a_thresholds = parent_stump.a_thresholds + [parent_stump.threshold]
- a_signs = parent_stump.a_signs + [is_right_child]
- # Get indices for left and right children of the node in question
- left_child = tree_struct.children_left[node_no]
- right_child = tree_struct.children_right[node_no]
- # Get quantities relevant to the node in question
- feature = tree_struct.feature[node_no]
- threshold = tree_struct.threshold[node_no]
- left_size = tree_struct.weighted_n_node_samples[left_child]
- right_size = tree_struct.weighted_n_node_samples[right_child]
- parent_size = tree_struct.weighted_n_node_samples[node_no]
- normalization = parent_size if normalize else 1
- left_val = - np.sqrt(right_size / (left_size * normalization))
- right_val = np.sqrt(left_size / (right_size * normalization))
- return LocalDecisionStump(feature, threshold, left_val, right_val,
- a_features, a_thresholds, a_signs)
- def make_stumps(tree_struct, normalize=False):
- """
- Create a collection of local decision stumps corresponding to all internal
- nodes in a scikit-learn tree structure object.
- Parameters
- ----------
- tree_struct: object
- The scikit-learn tree object
- normalize: bool
- Flag. If set to True, then divide the nonzero function values by
- sqrt(n_samples in node) so that the vector of function values on the
- training set has unit norm. If False, then do not divide, so that the
- vector of function values on the training set has norm equal to
- n_samples in node.
- Returns
- -------
- stumps: list of LocalDecisionStump objects
- The local decision stumps corresponding to all internal node in the
- tree structure
- """
- stumps = []
- def make_stump_iter(node_no, tree_struct, parent_stump, is_right_child,
- normalize, stumps):
- """
- Helper function for iteratively making local decision stump objects and
- appending them to the list stumps.
- """
- new_stump = make_stump(node_no, tree_struct, parent_stump,
- is_right_child, normalize)
- stumps.append(new_stump)
- left_child = tree_struct.children_left[node_no]
- right_child = tree_struct.children_right[node_no]
- if tree_struct.feature[left_child] != -2: # is not leaf
- make_stump_iter(left_child, tree_struct, new_stump, False,
- normalize, stumps)
- if tree_struct.feature[right_child] != -2: # is not leaf
- make_stump_iter(right_child, tree_struct, new_stump, True,
- normalize, stumps)
- make_stump_iter(0, tree_struct, None, None, normalize, stumps)
- return stumps
- def tree_feature_transform(stumps, X):
- """
- Transform the data matrix X using a mapping derived from a collection of
- local decision stump functions.
- If the list of stumps is empty, return an array of shape (0, n_samples).
- Parameters
- ----------
- stumps: list of LocalDecisionStump objects
- List of stump functions to use to transform data
- X: array-like of shape (n_samples, n_features)
- Original data matrix
- Returns
- -------
- X_transformed: array-like of shape (n_samples, n_stumps)
- Transformed data matrix
- """
- transformed_feature_vectors = [np.empty((X.shape[0], 0))]
- for stump in stumps:
- transformed_feature_vec = stump(X)[:, np.newaxis]
- transformed_feature_vectors.append(transformed_feature_vec)
- X_transformed = np.hstack(transformed_feature_vectors)
- return X_transformed
- def _compare(data, k, threshold, sign=True):
- """
- Obtain indicator vector for the samples with k-th feature > threshold
- """
- if sign:
- return data[:, k] > threshold
- else:
- return data[:, k] <= threshold
- def _compare_all(data, ks, thresholds, signs):
- """
- Obtain indicator vector for the samples with k-th feature > threshold or
- <= threshold (depending on sign) for all k in ks
- """
- return ~np.logical_xor(data[:, ks] > thresholds, signs)
|