Module imodels.util.score
from typing import List, Tuple
from warnings import warn

import pandas as pd
import numpy as np
from sklearn.utils import indices_to_mask
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegressionCV

from imodels.util.rule import Rule


def score_oob(X,
              y,
              rules: List[List[str]],
              samples: List[List[int]],
              features: List[List[int]],
              feature_names: List[str]) -> List[Rule]:
    scored_rules = []

    for curr_rules, curr_samples, curr_features in zip(rules, samples, features):

        # Create mask for OOB samples
        mask = ~indices_to_mask(curr_samples, X.shape[0])

        if sum(mask) == 0:
            warn(
                "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
                " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
            )
            mask = curr_samples

        # XXX todo: idem without dataframe
        X_oob = pd.DataFrame(
            (X[mask, :])[:, curr_features],
            columns=np.array(feature_names)[curr_features]
        )

        if X_oob.shape[1] <= 1:  # otherwise pandas bug (cf. issue #16363)
            return []

        y_oob = y[mask]
        y_oob = np.array((y_oob != 0))

        # Add OOB performances to rules:
        scored_rules += [
            Rule(r, args=_eval_rule_perf(r, X_oob, y_oob)) for r in set(curr_rules)
        ]

    return scored_rules


def _eval_rule_perf(rule, X, y) -> Tuple[float, float]:
    detected_index = list(X.query(rule).index)
    if len(detected_index) <= 1:
        return (0, 0)
    y_detected = y[detected_index]
    true_pos = y_detected[y_detected > 0].sum()
    if true_pos == 0:
        return (0, 0)
    pos = y[y > 0].sum()
    return y_detected.mean(), float(true_pos) / pos


def score_lasso(X, y, rules: List[str], Cs, cv, random_state) -> Tuple[List[Rule], LassoCV]:
    if Cs is None:
        n_alphas = 100
        alphas = None
    elif hasattr(Cs, "__len__"):
        n_alphas = None
        alphas = 1. / Cs
    else:
        n_alphas = Cs
        alphas = None
    lscv = LassoCV(n_alphas=n_alphas, alphas=alphas, cv=cv, random_state=random_state)
    lscv.fit(X, y)
    rules = [Rule(r, args=[w]) for r, w in zip(rules, lscv.coef_[-len(rules):])]
    return rules, lscv
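For orientation, a small worked example of what the private helper _eval_rule_perf computes; the data below is made up for illustration. On the rows selected by the rule's pandas query, it returns the mean of y (a precision-style score for binary labels) and the fraction of all positives captured (a recall).

import numpy as np
import pandas as pd
from imodels.util.score import _eval_rule_perf

# Illustrative data: 6 samples with binary labels
X = pd.DataFrame({"x0": [0.1, 0.6, 0.7, 0.9, 0.2, 0.8]})
y = np.array([0, 1, 1, 0, 0, 1])

prec, rec = _eval_rule_perf("x0 > 0.5", X, y)
# The rule fires on rows 1, 2, 3, 5 -> y_detected = [1, 1, 0, 1]
# prec = mean(y_detected) = 0.75, rec = 3 / 3 = 1.0
print(prec, rec)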
Functions
def score_lasso(X, y, rules, Cs, cv, random_state)
Fit a LassoCV model to the rule features and return each rule as a Rule whose args holds its fitted coefficient, together with the fitted LassoCV object. Cs may be None (100 automatically chosen alphas), a sequence of inverse-regularization strengths, or an integer number of alphas.
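A minimal usage sketch, assuming the last columns of X are rule-activation features (one column per entry in rules) and that Rule exposes the args attribute set by its constructor; the data and rule strings are made up for illustration.

import numpy as np
from imodels.util.score import score_lasso

rng = np.random.RandomState(0)
X_rules = rng.randint(0, 2, size=(100, 3)).astype(float)  # one column per rule
y = 2.0 * X_rules[:, 0] + rng.normal(size=100)
rule_strings = ["age > 50.0", "bmi <= 30.0", "age > 50.0 and bmi <= 30.0"]

# Cs=None -> LassoCV picks 100 alphas automatically
scored, lscv = score_lasso(X_rules, y, rule_strings, Cs=None, cv=3, random_state=0)
for rule in scored:
    print(rule, rule.args)  # each rule carries its LassoCV coefficient in args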
def score_oob(X, y, rules, samples, features, feature_names)
Score each candidate rule on the out-of-bag samples of the bagging iteration that produced it, returning Rule objects whose args holds the (precision-like, recall) tuple computed by _eval_rule_perf. If an iteration has no OOB samples, evaluation falls back to the in-bag samples with a warning.
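A minimal usage sketch, assuming rules, samples, and features each contain one inner list per bagging estimator (as produced by a bagged rule generator) and that Rule exposes the args attribute set by its constructor; the data and rule strings are made up for illustration.

import numpy as np
from imodels.util.score import score_oob

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 2))
y = (X[:, 0] > 0).astype(int)
feature_names = ["feature_0", "feature_1"]

# One estimator: its candidate rules, the in-bag sample indices it saw,
# and the feature indices it used
rules = [["feature_0 > 0.5", "feature_1 <= 0.0"]]
samples = [list(range(100))]   # rows 100-199 are out-of-bag
features = [[0, 1]]

scored = score_oob(X, y, rules, samples, features, feature_names)
for rule in scored:
    print(rule, rule.args)  # (precision-like score, recall) on the OOB rows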