import copy
import warnings
import numpy as np
from sklearn.linear_model import LassoCV, LinearRegression, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from imodels.importance.representation import TreeTransformer
class R2F:
"""
Class to compute R2F feature importance values.
:param estimator: scikit-learn estimator,
default=RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33)
The scikit-learn tree or tree ensemble estimator object
:param max_components_type: {"auto", "median_splits", "max_splits", "nsamples", "nstumps", "min_nsamples_nstumps",
"min_fracnsamples_nstumps"} or int, default="auto"
Method for choosing the max number of components for the PCA transformer for each sub-representation corresponding
to an original feature:
- If "auto", then same settings are used as "min_fracnsamples_nstumps"
- If "median_splits", then max_components is alpha * median number of splits on the original feature
among trees in the estimator
- If "max_splits", then max_components is alpha * maximum number of splits on the original feature among
trees in the estimator
- If "nsamples", then max_components is alpha * n_samples
- If "nstumps", then max_components is alpha * n_stumps
- If "min_nsamples_nstumps", then max_components is alpha * min(n_samples, n_stumps), where n_stumps is
total number of local decision stumps splitting on that feature in the ensemble
- If "min_fracnsamples_nstumps", then max_components is min(alpha * n_samples, n_stumps), where n_stumps is
total number of local decision stumps splitting on that feature in the ensemble
- If int, then max_components is the given integer
:param alpha: float, default=0.5
Parameter for adjusting the max number of components for PCA.
:param normalize: bool, default=False
If True, divide the nonzero function values of each local decision stump by sqrt(n_samples in node) so that
the vector of function values on the training set has unit norm. If False, do not divide, so that this
vector has norm equal to n_samples in node.
:param random_state: int, default=None
Random seed for sample splitting
:param criterion: {"aic", "bic", "cv"}, default="bic"
Criterion used for lasso model selection
:param refit: bool, default=True
If True, refit an OLS model on the lasso support to compute the r2 values; otherwise, compute the r2 values
from the lasso model itself
:param add_raw: bool, default=True
If True, concatenate the raw (centered) feature X_k with the learnt PCA representation
:param n_splits: int, default=10
The number of sample splits used to compute the r2f values. Must be even.
"""
def __init__(self, estimator=None, max_components_type="auto", alpha=0.5, normalize=False, random_state=None,
criterion="bic", refit=True, add_raw=True, n_splits=10):
if estimator is None:
self.estimator = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33)
else:
self.estimator = estimator
self.max_components_type = max_components_type
self.alpha = alpha
self.normalize = normalize
self.random_state = random_state
self.criterion = criterion
self.refit = refit
self.add_raw = add_raw
self.n_splits = n_splits
def get_importance_scores(self, X, y, sample_weight=None, diagnostics=False):
"""
Compute R2F feature importance values.
:param X: array-like of shape (n_samples, n_features)
Covariate data matrix
:param y: array-like of shape (n_samples,)
Vector of responses
:param sample_weight: array-like of shape (n_samples,) or None, default=None
Sample weights to use in fitting the tree ensemble for feature learning
:param diagnostics: bool
If False, return only the r2f values. If True, also return the r2 values, number of stumps, and
number of PCs chosen in lasso model selection for each feature and over each split
:return:
r2f_values: array-like of shape (n_features,)
The computed r2f values
r_squared: array-like of shape (n_splits, n_features)
The r2 values (for each feature) over each run
n_stumps: array-like of shape (n_splits, n_features)
The number of stumps in the ensemble (splitting on each feature) over each run
n_components_chosen: array-like of shape (n_splits, n_features)
The number of PCs chosen (for each feature) over each run
"""
n_features = X.shape[1]
r_squared = np.zeros((self.n_splits, n_features))
n_components_chosen = np.zeros((self.n_splits, n_features))
n_stumps = np.zeros((self.n_splits, n_features))
if self.n_splits % 2 != 0:
raise ValueError("n_splits has to be an even integer")
if sample_weight is None:
sample_weight = np.ones_like(y)
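# Each pass through this loop makes one 50/50 sample split and yields two sets of
# scores: features are learned on half A and scored on half B, then the halves swap roles.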
for i in range(self.n_splits // 2):
if self.random_state is not None:
random_state = self.random_state + i
else:
random_state = None
X_a, X_b, y_a, y_b, sample_weight_a, sample_weight_b = \
train_test_split(X, y, sample_weight, test_size=0.5, random_state=random_state)
tree_transformer_a = self._feature_learning_one_split(X_a, y_a, sample_weight_a)
r_squared[2*i, :], n_stumps[2*i, :], n_components_chosen[2*i, :] = \
self._model_selection_r2_one_split(tree_transformer_a, X_b, y_b)
tree_transformer_b = self._feature_learning_one_split(X_b, y_b, sample_weight_b)
r_squared[2*i+1, :], n_stumps[2*i+1, :], n_components_chosen[2*i+1, :] = \
self._model_selection_r2_one_split(tree_transformer_b, X_a, y_a)
r2f_values = np.mean(r_squared, axis=0)
if diagnostics:
return r2f_values, r_squared, n_stumps, n_components_chosen
else:
return r2f_values
def _feature_learning_one_split(self, X_train, y_train, sample_weight=None):
"""
Steps 1 and 2 of r2f: Fit the RF (or other tree ensemble) and learn feature representations from it,
storing the information in the TreeTransformer class
"""
if self.max_components_type == "auto":
max_components_type = "min_fracnsamples_nstumps"
else:
max_components_type = self.max_components_type
estimator = copy.deepcopy(self.estimator)
estimator.fit(X_train, y_train, sample_weight=sample_weight)
tree_transformer = TreeTransformer(estimator=estimator, max_components_type=max_components_type,
alpha=self.alpha, normalize=self.normalize)
tree_transformer.fit(X_train)
return tree_transformer
def _model_selection_r2_one_split(self, tree_transformer, X_val, y_val):
"""
Step 3 of r2f: Do lasso model selection and compute r2 values
"""
n_features = X_val.shape[1]
r_squared = np.zeros(n_features)
n_components_chosen = np.zeros(n_features)
n_stumps = np.zeros(n_features)
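# For each original feature, run the lasso over its PCA components (plus, optionally,
# the centered raw feature) and record the validation r2 of the selected model.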
for k in range(n_features):
X_transformed = tree_transformer.transform_one_feature(X_val, k)
n_stumps[k] = len(tree_transformer.get_stumps_for_feature(k))
if X_transformed is None:
r_squared[k] = 0
n_components_chosen[k] = 0
else:
y_val_centered = y_val - np.mean(y_val)
if self.add_raw:
X_transformed = np.hstack([X_val[:, [k]] - np.mean(X_val[:, k]), X_transformed])
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
if self.criterion == "cv":
lasso = LassoCV(fit_intercept=False, normalize=False)
else:
lasso = LassoLarsIC(criterion=self.criterion, normalize=False, fit_intercept=False)
lasso.fit(X_transformed, y_val_centered)
n_components_chosen[k] = np.count_nonzero(lasso.coef_)
if self.refit:
support = np.nonzero(lasso.coef_)[0]
if len(support) == 0:
r_squared[k] = 0.0
else:
lr = LinearRegression().fit(X_transformed[:, support], y_val_centered)
r_squared[k] = lr.score(X_transformed[:, support], y_val_centered)
else:
r_squared[k] = lasso.score(X_transformed, y_val_centered)
return r_squared, n_stumps, n_components_chosen
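To see the model-selection step in isolation, the following sketch mimics what _model_selection_r2_one_split does for a single feature. The data are random stand-ins for a transformed feature representation (X_rep and y_val are illustrative names, not library API), and the deprecated normalize argument is omitted:

import numpy as np
from sklearn.linear_model import LassoLarsIC, LinearRegression

rng = np.random.default_rng(0)
X_rep = rng.normal(size=(200, 10))   # stands in for transform_one_feature(X_val, k)
y_val = X_rep[:, 0] + 0.5 * X_rep[:, 1] + rng.normal(scale=0.5, size=200)
y_centered = y_val - y_val.mean()    # centering removes the need for an intercept

# Select components with the lasso under an information criterion, as with criterion="bic"
lasso = LassoLarsIC(criterion="bic", fit_intercept=False)
lasso.fit(X_rep, y_centered)
support = np.nonzero(lasso.coef_)[0]

# With refit=True, OLS is refit on the selected support and its r2 on the
# validation half becomes the feature's score for this split
if len(support) == 0:
    r2 = 0.0
else:
    ols = LinearRegression().fit(X_rep[:, support], y_centered)
    r2 = ols.score(X_rep[:, support], y_centered)
print(f"{len(support)} components chosen, r2 = {r2:.3f}")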
Classes
class R2F (estimator=None, max_components_type='auto', alpha=0.5, normalize=False, random_state=None, criterion='bic', refit=True, add_raw=True, n_splits=10)
Class to compute R2F feature importance values.
:param estimator: scikit-learn estimator, default=RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33)
The scikit-learn tree or tree ensemble estimator object
:param max_components_type: {"auto", "median_splits", "max_splits", "nsamples", "nstumps", "min_nsamples_nstumps", "min_fracnsamples_nstumps"} or int, default="auto"
Method for choosing the max number of components for the PCA transformer for each sub-representation corresponding to an original feature:
- If "auto", then the same setting as "min_fracnsamples_nstumps" is used
- If "median_splits", then max_components is alpha * median number of splits on the original feature among trees in the estimator
- If "max_splits", then max_components is alpha * maximum number of splits on the original feature among trees in the estimator
- If "nsamples", then max_components is alpha * n_samples
- If "nstumps", then max_components is alpha * n_stumps
- If "min_nsamples_nstumps", then max_components is alpha * min(n_samples, n_stumps), where n_stumps is the total number of local decision stumps splitting on that feature in the ensemble
- If "min_fracnsamples_nstumps", then max_components is min(alpha * n_samples, n_stumps), where n_stumps is the total number of local decision stumps splitting on that feature in the ensemble
- If int, then max_components is the given integer
:param alpha: float, default=0.5
Parameter for adjusting the max number of components for PCA.
:param normalize: bool, default=False
If True, divide the nonzero function values of each local decision stump by sqrt(n_samples in node) so that the vector of function values on the training set has unit norm. If False, do not divide, so that this vector has norm equal to n_samples in node.
:param random_state: int, default=None
Random seed for sample splitting
:param criterion: {"aic", "bic", "cv"}, default="bic"
Criterion used for lasso model selection
:param refit: bool, default=True
If True, refit an OLS model on the lasso support to compute the r2 values; otherwise, compute the r2 values from the lasso model itself
:param add_raw: bool, default=True
If True, concatenate the raw (centered) feature X_k with the learnt PCA representation
:param n_splits: int, default=10
The number of sample splits used to compute the r2f values. Must be even.
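As a reading aid, the max_components rules can be written out directly. The helper below is a hypothetical restatement, not part of the library (the actual computation lives in TreeTransformer); median_splits and max_splits stand for the per-feature split statistics named in the corresponding rules:

def resolve_max_components(rule, alpha, n_samples, n_stumps,
                           median_splits=None, max_splits=None):
    """Hypothetical restatement of the max_components rules documented above."""
    if isinstance(rule, int):
        return rule
    if rule == "auto":  # same behavior as "min_fracnsamples_nstumps"
        rule = "min_fracnsamples_nstumps"
    if rule == "median_splits":
        return int(alpha * median_splits)
    if rule == "max_splits":
        return int(alpha * max_splits)
    if rule == "nsamples":
        return int(alpha * n_samples)
    if rule == "nstumps":
        return int(alpha * n_stumps)
    if rule == "min_nsamples_nstumps":
        return int(alpha * min(n_samples, n_stumps))
    if rule == "min_fracnsamples_nstumps":
        return int(min(alpha * n_samples, n_stumps))
    raise ValueError(f"unknown max_components_type: {rule}")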
Methods
def get_importance_scores(self, X, y, sample_weight=None, diagnostics=False)
Compute R2F feature importance values.
:param X: array-like of shape (n_samples, n_features)
Covariate data matrix
:param y: array-like of shape (n_samples,)
Vector of responses
:param sample_weight: array-like of shape (n_samples,) or None, default=None
Sample weights to use in fitting the tree ensemble for feature learning
:param diagnostics: bool
If False, return only the r2f values. If True, also return the r2 values, number of stumps, and number of PCs chosen in lasso model selection for each feature and over each split
:return:
r2f_values: array-like of shape (n_features,)
The computed r2f values
r_squared: array-like of shape (n_splits, n_features)
The r2 values (for each feature) over each run
n_stumps: array-like of shape (n_splits, n_features)
The number of stumps in the ensemble (splitting on each feature) over each run
n_components_chosen: array-like of shape (n_splits, n_features)
The number of PCs chosen (for each feature) over each run
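A minimal usage sketch on synthetic data, assuming the R2F class defined above is in scope (the data and settings are illustrative):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 5))
y = X[:, 0] + 2 * X[:, 1] ** 2 + rng.normal(scale=0.5, size=500)

r2f = R2F(criterion="bic", n_splits=10, random_state=42)
scores, r_squared, n_stumps, n_components = r2f.get_importance_scores(
    X, y, diagnostics=True)
print(scores)           # shape (5,): mean r2 per feature across the 10 splits
print(r_squared.shape)  # (10, 5): per-split r2 values for each feature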