1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
import os
import unittest

import numpy as np
import pandas as pd
from scipy.io.arff import loadarff
from sklearn.model_selection import train_test_split

from imodels.rule_list.bayesian_rule_list.bayesian_rule_list import BayesianRuleListClassifier
from imodels.discretization import ExtraBasicDiscretizer

# Absolute directory of this test file; used to locate on-disk test fixtures
# (e.g. test_data/diabetes.arff in the commented-out integration test below).
path_to_tests = os.path.dirname(os.path.realpath(__file__))
class TestBRL(unittest.TestCase):
    """Integration tests for BayesianRuleListClassifier."""

    def test_integration_stability(self):
        """Test on synthetic dataset.

        Fits a rule list on a tiny, cleanly separable binary dataset and
        checks that per-row predictions reproduce the training labels exactly.
        """
        X = np.array([[0, 0, 1, 1, 0],
                      [1, 0, 0, 0, 0],
                      [0, 0, 1, 0, 0],
                      [1, 0, 0, 0, 0],
                      [1, 1, 0, 1, 1],
                      [1, 1, 1, 1, 1],
                      [0, 1, 1, 1, 1],
                      [1, 0, 1, 1, 1]])
        y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        M = BayesianRuleListClassifier(minsupport=0.02, maxcardinality=1)
        feat = ['ft1', 'ft2', 'ft3', 'ft4', 'ft5']
        M.fit(X, y, feature_names=feat)
        # Predict each row individually (one-sample batches) and require an
        # exact match with the training labels.
        preds = np.array([M.predict(np.array([row]), threshold=0.5)
                          for row in X]).flatten()
        assert (preds == y).all()

    # def test_integration_fitting(self):
    #     '''Test on a real (small) dataset
    #     '''
    #     np.random.seed(13)
    #     feature_names = ["#Pregnant", "Glucose concentration test", "Blood pressure(mmHg)",
    #                      "Triceps skin fold thickness(mm)",
    #                      "2-Hour serum insulin (mu U/ml)", "Body mass index", "Diabetes pedigree function",
    #                      "Age (years)"]
    #     data = loadarff(os.path.join(path_to_tests, "test_data/diabetes.arff"))
    #     data_np = np.array(list(map(lambda x: np.array(list(x)), data[0])))
    #     X, y_text = data_np[:, :-1].astype('float32'), data_np[:, -1].astype('str')
    #     y = (y_text == 'tested_positive').astype(int)  # labels 0-1
    #     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)  # split
    #     disc = ExtraBasicDiscretizer(feature_names, n_bins=3, strategy='uniform')
    #     X_train_disc = disc.fit_transform(pd.DataFrame(X_train, columns=feature_names))
    #     X_test_disc = disc.transform(pd.DataFrame(X_test, columns=feature_names))
    #     # train classifier (allow more iterations for better accuracy; use BigDataRuleListClassifier for large datasets)
    #     print('training...')
    #     model = BayesianRuleListClassifier(max_iter=300, minsupport=0.4, maxcardinality=1, class1label="diabetes",
    #                                        verbose=False)
    #     model.fit(X_train_disc.values, y_train, feature_names=X_train_disc.columns)
    #     preds = model.predict(X_test_disc.values, threshold=0.1)
    #     print("RuleListClassifier Accuracy:", np.mean(y_test == preds), "Learned interpretable model:\n", model)
|