Module imodels.tree.optimal_classification_tree.example

Expand source code
import logging

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer as load_data
from sklearn.tree import DecisionTreeClassifier

from .optree import OptimalHyperTreeModel

# Import-time side effect: configure the root logger so INFO-and-above records
# are emitted with a timestamp, the source file name and the line number.
logging.basicConfig(
    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
    level=logging.INFO,
)

# Import-time side effect: lift pandas' display limits so printed frames are
# never truncated by row or by column.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


def run_example(depth: int = 2):
    """Compare a CART tree with an optimal hyper-tree on the breast cancer data.

    The breast cancer dataset is loaded, split at random into roughly 80%
    training rows and 20% test rows, and used to fit both a scikit-learn
    ``DecisionTreeClassifier`` and an ``OptimalHyperTreeModel`` of the same
    maximum depth. The training-frame shape, the fitted ``model.a`` attribute
    and the test accuracy of both models are printed to stdout.

    Parameters
    ----------
    depth : int, default 2
        Maximum depth passed to both tree models.

    Returns
    -------
    None
        Results are only printed.
    """
    features, label = load_data(return_X_y=True)
    p = features.shape[1]
    column_names = ["x{0}".format(i) for i in range(p)]
    data = pd.DataFrame(data=features, columns=column_names)
    data["label"] = label

    # Split by shuffling every row index once and slicing. This guarantees the
    # test rows are unique and disjoint from the training rows; the previous
    # with-replacement draw (deprecated np.random.random_integers) produced
    # duplicate test rows and an oversized training set.
    n_rows = data.shape[0]
    n_test = int(n_rows * 0.2)
    shuffled = np.random.permutation(n_rows)
    test_indices = shuffled[:n_test]
    # Keep the training rows in their original dataset order.
    train_indices = np.sort(shuffled[n_test:])

    # reset_index() keeps the original row number as an extra "index" column;
    # only the columns named in column_names are passed to the model as
    # features.
    train = data.iloc[train_indices].reset_index()
    test = data.iloc[test_indices].reset_index()

    print(train.shape)

    # Use sklearn
    train_features_sklearn = features[train_indices]
    train_label_sklearn = label[train_indices]
    test_features_sklearn = features[test_indices]
    test_label_sklearn = label[test_indices]
    cart_model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=1)
    clf = cart_model.fit(train_features_sklearn, train_label_sklearn)
    predicted_y = clf.predict(test_features_sklearn)

    # Use PyOptree
    model = OptimalHyperTreeModel(column_names, "label", tree_depth=depth, N_min=1)
    model.train(train, train_method="mio")

    # predict() returns the test frame with a "prediction" column added.
    test = model.predict(test)

    print(model.a)

    # Accuracy is the fraction of test rows whose prediction matches the label.
    optree_accuracy = np.mean(test["prediction"] == test["label"])
    print("PyOptree Library Tree Prediction Accuracy: {}".format(optree_accuracy))

    sklearn_accuracy = np.mean(predicted_y == test_label_sklearn)
    print("SKLearn Library Tree Prediction Accuracy: {}".format(sklearn_accuracy))


if __name__ == "__main__":
    # Script entry point: run the comparison with depth-3 trees.
    # NOTE: this module uses a relative import (``from .optree import ...``),
    # so it has to be executed as part of its package (e.g. ``python -m``).
    run_example(3)

Functions

def run_example(depth=2)
Expand source code
def run_example(depth: int = 2):
    """Compare a CART tree with an optimal hyper-tree on the breast cancer data.

    The breast cancer dataset is loaded, split at random into roughly 80%
    training rows and 20% test rows, and used to fit both a scikit-learn
    ``DecisionTreeClassifier`` and an ``OptimalHyperTreeModel`` of the same
    maximum depth. The training-frame shape, the fitted ``model.a`` attribute
    and the test accuracy of both models are printed to stdout.

    Parameters
    ----------
    depth : int, default 2
        Maximum depth passed to both tree models.

    Returns
    -------
    None
        Results are only printed.
    """
    features, label = load_data(return_X_y=True)
    p = features.shape[1]
    column_names = ["x{0}".format(i) for i in range(p)]
    data = pd.DataFrame(data=features, columns=column_names)
    data["label"] = label

    # Split by shuffling every row index once and slicing. This guarantees the
    # test rows are unique and disjoint from the training rows; the previous
    # with-replacement draw (deprecated np.random.random_integers) produced
    # duplicate test rows and an oversized training set.
    n_rows = data.shape[0]
    n_test = int(n_rows * 0.2)
    shuffled = np.random.permutation(n_rows)
    test_indices = shuffled[:n_test]
    # Keep the training rows in their original dataset order.
    train_indices = np.sort(shuffled[n_test:])

    # reset_index() keeps the original row number as an extra "index" column;
    # only the columns named in column_names are passed to the model as
    # features.
    train = data.iloc[train_indices].reset_index()
    test = data.iloc[test_indices].reset_index()

    print(train.shape)

    # Use sklearn
    train_features_sklearn = features[train_indices]
    train_label_sklearn = label[train_indices]
    test_features_sklearn = features[test_indices]
    test_label_sklearn = label[test_indices]
    cart_model = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=1)
    clf = cart_model.fit(train_features_sklearn, train_label_sklearn)
    predicted_y = clf.predict(test_features_sklearn)

    # Use PyOptree
    model = OptimalHyperTreeModel(column_names, "label", tree_depth=depth, N_min=1)
    model.train(train, train_method="mio")

    # predict() returns the test frame with a "prediction" column added.
    test = model.predict(test)

    print(model.a)

    # Accuracy is the fraction of test rows whose prediction matches the label.
    optree_accuracy = np.mean(test["prediction"] == test["label"])
    print("PyOptree Library Tree Prediction Accuracy: {}".format(optree_accuracy))

    sklearn_accuracy = np.mean(predicted_y == test_label_sklearn)
    print("SKLearn Library Tree Prediction Accuracy: {}".format(sklearn_accuracy))