Module imodels.bayesian_rule_list.RuleListClassifier
Source code
from sklearn.base import BaseEstimator
import sklearn.metrics
import sys
import numpy as np
import pandas as pd
from .brl import *
from .discretization.MDLP import *
import numbers
import random
class RuleListClassifier(BaseEstimator):
    """
    This is a scikit-learn compatible wrapper for the Bayesian Rule List
    classifier developed by Benjamin Letham. It produces a highly
    interpretable model (a list of decision rules) of the same form as
    an expert system.

    Parameters
    ----------
    listlengthprior : int, optional (default=3)
        Prior hyperparameter for expected list length (excluding null rule)
    listwidthprior : int, optional (default=1)
        Prior hyperparameter for expected list width (excluding null rule)
    maxcardinality : int, optional (default=2)
        Maximum cardinality of an itemset
    minsupport : int, optional (default=10)
        Minimum support (%) of an itemset
    alpha : array_like, shape = [n_classes]
        Prior hyperparameter for multinomial pseudocounts
    n_chains : int, optional (default=3)
        Number of MCMC chains for inference
    max_iter : int, optional (default=50000)
        Maximum number of iterations
    class1label : str, optional (default="class 1")
        Label or description of what the positive class (with y=1) means
    verbose : bool, optional (default=True)
        Verbose output
    random_state : int
        Random seed
    """

    def __init__(self, listlengthprior=3, listwidthprior=1, maxcardinality=2,
                 minsupport=10, alpha=np.array([1., 1.]), n_chains=3,
                 max_iter=50000, class1label="class 1", verbose=True,
                 random_state=42):
        self.listlengthprior = listlengthprior
        self.listwidthprior = listwidthprior
        self.maxcardinality = maxcardinality
        self.minsupport = minsupport
        self.alpha = alpha
        self.n_chains = n_chains
        self.max_iter = max_iter
        self.class1label = class1label
        self.verbose = verbose
        self._zmin = 1
        self.thinning = 1  # the thinning rate
        self.burnin = self.max_iter // 2  # the number of samples to drop as burn-in in-simulation
        self.discretizer = None
        self.d_star = None
        self.random_state = random_state
        self.seed()

    def seed(self):
        if self.random_state is not None:
            random.seed(self.random_state)
            np.random.seed(self.random_state)

    def _setlabels(self, X, feature_labels=[]):
        if len(feature_labels) == 0:
            if type(X) == pd.DataFrame and ('object' in str(X.columns.dtype) or 'str' in str(X.columns.dtype)):
                feature_labels = X.columns
            else:
                feature_labels = ["ft" + str(i + 1) for i in range(len(X[0]))]
        self.feature_labels = feature_labels

    def _discretize_mixed_data(self, X, y, undiscretized_features=[]):
        if type(X) != list:
            X = np.array(X).tolist()
        # check which features are numeric (to be discretized)
        self.discretized_features = []
        for fi in range(len(X[0])):
            # if not string, and not specified as undiscretized
            if isinstance(X[0][fi], numbers.Number) and \
                    (len(self.feature_labels) == 0 or len(undiscretized_features) == 0 or
                     self.feature_labels[fi] not in undiscretized_features):
                self.discretized_features.append(self.feature_labels[fi])
        if len(self.discretized_features) > 0:
            if self.verbose:
                print("Warning: non-categorical data found. Trying to discretize. "
                      "(Please convert categorical values to strings, and/or specify the argument "
                      "'undiscretized_features', to avoid this.)")
            X = self.discretize(X, y)
        return X

    def _setdata(self, X, y, feature_labels=[], undiscretized_features=[]):
        self._setlabels(X, feature_labels)
        X = self._discretize_mixed_data(X, y, undiscretized_features)
        return X, y

    def fit(self, X, y, feature_labels=[], undiscretized_features=[], verbose=False):
        """Fit rule lists to data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data
        y : array_like, shape = [n_samples]
            Labels
        feature_labels : array_like, shape = [n_features], optional (default: [])
            String labels for each feature. If empty and X is a DataFrame, column
            labels are used. If empty and X is not a DataFrame, then features are
            simply enumerated
        undiscretized_features : array_like, shape = [n_features], optional (default: [])
            String labels for each feature which is NOT to be discretized. If empty,
            all numeric features are discretized
        verbose : bool
            Currently doesn't do anything

        Returns
        -------
        self : returns an instance of self.
        """
        self.seed()
        if len(set(y)) != 2:
            raise Exception("Only binary classification is supported at this time!")
        # deal with pandas data
        if type(X) in [pd.DataFrame, pd.Series]:
            X = X.values
        if type(y) in [pd.DataFrame, pd.Series]:
            y = y.values
        X, y = self._setdata(X, y, feature_labels, undiscretized_features)

        permsdic = defaultdict(default_permsdic)  # We will store the MCMC results here
        data = list(X[:])

        # Now find frequent itemsets
        # Mine separately for each class
        data_pos = [x for i, x in enumerate(data) if y[i] == 0]
        data_neg = [x for i, x in enumerate(data) if y[i] == 1]
        assert len(data_pos) + len(data_neg) == len(data)
        try:
            itemsets = [r[0] for r in fpgrowth(data_pos, supp=self.minsupport, zmin=self._zmin, zmax=self.maxcardinality)]
            itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=self.minsupport, zmin=self._zmin, zmax=self.maxcardinality)])
        except TypeError:
            itemsets = [r[0] for r in fpgrowth(data_pos, supp=self.minsupport, min=self._zmin, max=self.maxcardinality)]
            itemsets.extend([r[0] for r in fpgrowth(data_neg, supp=self.minsupport, min=self._zmin, max=self.maxcardinality)])
        itemsets = list(set(itemsets))
        if self.verbose:
            print(len(itemsets), 'rules mined')

        # Now form the data-vs.-lhs set
        # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
        X = [set() for j in range(len(itemsets) + 1)]
        X[0] = set(range(len(data)))  # the default rule satisfies all data
        for (j, lhs) in enumerate(itemsets):
            X[j + 1] = set([i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])

        # now form lhs_len
        lhs_len = [0]
        for lhs in itemsets:
            lhs_len.append(len(lhs))
        nruleslen = Counter(lhs_len)
        lhs_len = array(lhs_len)
        itemsets_all = ['null']
        itemsets_all.extend(itemsets)

        Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = \
            (X, np.vstack((1 - np.array(y), y)).T.astype(int), nruleslen, lhs_len, itemsets_all)

        # Do MCMC
        res, Rhat = run_bdl_multichain_serial(
            self.max_iter, self.thinning, self.alpha, self.listlengthprior,
            self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len,
            self.maxcardinality, permsdic, self.burnin, self.n_chains,
            [None] * self.n_chains, verbose=self.verbose, seed=self.random_state)

        # Merge the chains
        permsdic = merge_chains(res)

        # The point estimate, BRL-point
        self.d_star = get_point_estimate(permsdic, lhs_len, Xtrain, Ytrain, self.alpha,
                                         nruleslen, self.maxcardinality,
                                         self.listlengthprior, self.listwidthprior,
                                         verbose=self.verbose)  # get the point estimate
        if self.d_star:
            # Compute the rule consequent
            self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain, self.d_star, self.alpha, True)
        return self

    def discretize(self, X, y):
        if self.verbose:
            print("Discretizing ", self.discretized_features, "...")
        D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))),
                         columns=list(self.feature_labels) + ["y"])
        self.discretizer = MDLP_Discretizer(dataset=D, class_label="y",
                                            features=self.discretized_features)
        cat_data = pd.DataFrame(np.zeros_like(X))
        for i in range(len(self.feature_labels)):
            label = self.feature_labels[i]
            if label in self.discretized_features:
                column = []
                for j in range(len(self.discretizer._data[label])):
                    column += [label + " : " + self.discretizer._data[label][j]]
                cat_data.iloc[:, i] = np.array(column)
            else:
                cat_data.iloc[:, i] = D[label]
        return np.array(cat_data).tolist()

    def _prepend_feature_labels(self, X):
        Xl = np.copy(X).astype(str).tolist()
        for i in range(len(Xl)):
            for j in range(len(Xl[0])):
                Xl[i][j] = self.feature_labels[j] + " : " + Xl[i][j]
        return Xl

    def __str__(self):
        return self.tostring(decimals=1)

    def tostring(self, decimals=1):
        if self.d_star:
            detect = ""
            if self.class1label != "class 1":
                detect = "for detecting " + self.class1label
            header = "Trained RuleListClassifier " + detect + "\n"
            separator = "".join(["="] * len(header)) + "\n"
            s = ""
            for i, j in enumerate(self.d_star):
                if self.itemsets[j] != 'null':
                    condition = "ELSE IF " + (" AND ".join(
                        [str(self.itemsets[j][k]) for k in range(len(self.itemsets[j]))])) + " THEN"
                else:
                    condition = "ELSE"
                s += condition + " probability of " + self.class1label + ": " + \
                     str(np.round(self.theta[i] * 100, decimals)) + "% (" + \
                     str(np.round(self.ci_theta[i][0] * 100, decimals)) + "%-" + \
                     str(np.round(self.ci_theta[i][1] * 100, decimals)) + "%)\n"
            return header + separator + s[5:] + separator[1:]
        else:
            return "(Untrained RuleListClassifier)"

    def _to_itemset_indices(self, data):
        # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
        X = [set() for j in range(len(self.itemsets))]
        X[0] = set(range(len(data)))  # the default rule satisfies all data
        for (j, lhs) in enumerate(self.itemsets):
            if j > 0:
                X[j] = set([i for (i, xi) in enumerate(data) if set(lhs).issubset(xi)])
        return X

    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        T : array-like, shape = [n_samples, n_classes]
            Returns the probability of the sample for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        # deal with pandas data
        if type(X) in [pd.DataFrame, pd.Series]:
            X = X.values
        if self.discretizer is not None:
            self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels)
            self.discretizer.apply_cutpoints()
            D = self._prepend_feature_labels(np.array(self.discretizer._data)[:, :-1])
        else:
            D = X
        N = len(D)
        X2 = self._to_itemset_indices(D[:])
        P = preds_d_t(X2, np.zeros((N, 1), dtype=int), self.d_star, self.theta)
        return np.vstack((1 - P, P)).T

    def predict(self, X):
        """Perform classification on samples in X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        y_pred : array, shape = [n_samples]
            Class labels for samples in X.
        """
        # deal with pandas data
        if type(X) in [pd.DataFrame, pd.Series]:
            X = X.values
        return 1 * (self.predict_proba(X)[:, 1] >= 0.5)

    def score(self, X, y, sample_weight=None):
        return sklearn.metrics.accuracy_score(y, self.predict(X), sample_weight=sample_weight)
Classes
class RuleListClassifier (listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=10, alpha=array([1., 1.]), n_chains=3, max_iter=50000, class1label='class 1', verbose=True, random_state=42)
This is a scikit-learn compatible wrapper for the Bayesian Rule List classifier developed by Benjamin Letham. It produces a highly interpretable model (a list of decision rules) of the same form as an expert system.
Parameters
listlengthprior : int, optional (default=3)
    Prior hyperparameter for expected list length (excluding null rule)
listwidthprior : int, optional (default=1)
    Prior hyperparameter for expected list width (excluding null rule)
maxcardinality : int, optional (default=2)
    Maximum cardinality of an itemset
minsupport : int, optional (default=10)
    Minimum support (%) of an itemset
alpha : array_like, shape = [n_classes]
    Prior hyperparameter for multinomial pseudocounts
n_chains : int, optional (default=3)
    Number of MCMC chains for inference
max_iter : int, optional (default=50000)
    Maximum number of iterations
class1label : str, optional (default="class 1")
    Label or description of what the positive class (with y=1) means
verbose : bool, optional (default=True)
    Verbose output
random_state : int
    Random seed
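A minimal end-to-end sketch (not part of the generated documentation): the import path is assumed from the module name above, the toy data and hyperparameter choices are illustrative only, and fit requires the fpgrowth frequent-itemset miner to be installed.

import numpy as np
from imodels.bayesian_rule_list.RuleListClassifier import RuleListClassifier  # assumed import path

# Illustrative binary-classification data; numeric features are
# discretized automatically (via MDLP) inside fit().
rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

clf = RuleListClassifier(max_iter=10000, verbose=False)
clf.fit(X, y, feature_labels=["ft1", "ft2", "ft3"])

print(clf)              # human-readable IF ... THEN ... rule list
print(clf.score(X, y))  # training accuracy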
Ancestors
sklearn.base.BaseEstimator
Methods
def discretize(self, X, y)
Discretizes the numeric features recorded in self.discretized_features by fitting an MDLP_Discretizer on (X, y), then returns X as a list of rows whose entries are "label : bin" strings; columns that were not discretized are passed through unchanged.
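discretize is normally invoked internally by fit when numeric columns are detected. A hedged sketch of inspecting its side effects, reusing the clf fitted on numeric data in the class-level example above:

# fit() on numeric data populates these attributes via discretize():
print(clf.discretized_features)     # e.g. ['ft1', 'ft2', 'ft3']
print(clf.discretizer is not None)  # True: MDLP cutpoints were learned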
def fit(self, X, y, feature_labels=[], undiscretized_features=[], verbose=False)
Fit rule lists to data
Parameters
X : array-like, shape = [n_samples, n_features]
    Training data
y : array_like, shape = [n_samples]
    Labels
feature_labels : array_like, shape = [n_features], optional (default: [])
    String labels for each feature. If empty and X is a DataFrame, column
    labels are used. If empty and X is not a DataFrame, then features are
    simply enumerated
undiscretized_features : array_like, shape = [n_features], optional (default: [])
    String labels for each feature which is NOT to be discretized. If empty,
    all numeric features are discretized
verbose : bool
    Currently doesn't do anything
Returns
self : returns an instance of self.
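A sketch of fit on mixed feature types, assuming pandas input; the column names and values are made up. Note that fit converts a DataFrame to .values before labels are inferred, so feature_labels is passed explicitly here:

import pandas as pd
from imodels.bayesian_rule_list.RuleListClassifier import RuleListClassifier  # assumed import path

df = pd.DataFrame({
    "age": [23, 45, 31, 62, 54, 38],                                # numeric: discretized by MDLP
    "sex": ["male", "female", "female", "male", "male", "female"],  # categorical strings: left as-is
})
y = [0, 1, 0, 1, 1, 0]

clf = RuleListClassifier(max_iter=5000, verbose=False)
clf.fit(df, y,
        feature_labels=["age", "sex"],
        undiscretized_features=["sex"])  # explicitly exclude 'sex' from discretization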
def predict(self, X)
Perform classification on samples in X.
Parameters
X : array-like, shape = [n_samples, n_features]
Returns
y_pred : array, shape = [n_samples]
    Class labels for samples in X.
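As the source shows, predict is a 0.5 threshold on the positive-class column of predict_proba. A quick consistency check, reusing the fitted clf, X from the class-level sketch:

proba = clf.predict_proba(X)
y_pred = clf.predict(X)
# predict() is equivalent to thresholding column 1 at 0.5:
assert (y_pred == (proba[:, 1] >= 0.5).astype(int)).all()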
def predict_proba(self, X)
Compute probabilities of possible outcomes for samples in X.
Parameters
X : array-like, shape = [n_samples, n_features]
Returns
T : array-like, shape = [n_samples, n_classes]
    Returns the probability of the sample for each class in the model.
    The columns correspond to the classes in sorted order, as they appear
    in the attribute classes_.
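The result is assembled as np.vstack((1 - P, P)).T, so column 0 holds P(y=0), column 1 holds P(y=1), and every row sums to 1. A sanity check with a fitted clf:

import numpy as np

proba = clf.predict_proba(X)
print(proba.shape)                        # (n_samples, 2)
print(np.allclose(proba.sum(axis=1), 1))  # True: each row is a probability distribution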
def score(self, X, y, sample_weight=None)
Returns the mean accuracy of self.predict(X) with respect to y, computed with sklearn.metrics.accuracy_score; sample_weight is forwarded to it.
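Because score delegates to sklearn.metrics.accuracy_score, sample_weight behaves exactly as it does there. An illustrative weighted call, continuing from the class-level sketch (the weights are made up):

import numpy as np

w = np.ones(len(y))
w[:10] = 5.0  # hypothetical: up-weight the first ten samples
print(clf.score(X, y, sample_weight=w))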
def seed(self)
Seeds Python's built-in random module and NumPy's global random state with self.random_state, if it is not None.
def tostring(self, decimals=1)
Renders the trained rule list as a human-readable IF ... THEN ... chain, with rule probabilities and their credible intervals rounded to decimals places; returns "(Untrained RuleListClassifier)" if the model has not been fit.
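__str__ delegates to tostring, and decimals only controls the rounding of the printed percentages. Note that s[5:] strips the leading "ELSE " from the first rule, so the list starts with a plain IF. The output below is illustrative only; the rule conditions and numbers are made up:

print(clf.tostring(decimals=2))
# Trained RuleListClassifier
# ===========================
# IF ft1 : bin_2 AND ft2 : bin_3 THEN probability of class 1: 96.92% (91.31%-99.61%)
# ELSE probability of class 1: 11.17% (5.67%-18.06%)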