1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
- from imodels import OptimalTreeClassifier
- from imodels.util.data_util import get_clean_dataset
# Classification datasets to run, one 3-tuple per dataset.
# The ``__main__`` loop passes the second field to ``get_clean_dataset`` as the
# dataset identifier and the third field as its ``data_source`` argument; the
# first field is not read by that loop (presumably a short display name --
# TODO confirm against other users of this list).
# Commented-out tuples are disabled datasets; uncomment one to include it.
DATASETS_CLASSIFICATION = [
    # classification datasets from original random forests paper
    # page 9: https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf
    # ("sonar", "sonar", "pmlb"),
    # ("heart", "heart", "imodels"),
    # ("breast-cancer", "breast_cancer", "imodels"),
    # ("haberman", "haberman", "imodels"),
    ("ionosphere", "ionosphere", "pmlb"),
    ("diabetes", "diabetes", "pmlb"),
    # ("liver", "8", "openml"),
    #   note: we omit this dataset because its label was found to be incorrect
    #   (see caveat here: https://archive.ics.uci.edu/ml/datasets/liver+disorders#:~:text=The%207th%20field%20(selector)%20has%20been%20widely%20misinterpreted%20in%20the%20past%20as%20a%20dependent%20variable%20representing%20presence%20or%20absence%20of%20a%20liver%20disorder.)
    # ("credit-g", "credit_g", "imodels"),  # like german-credit, but more feats
    # ("german-credit", "german", "pmlb"),
    #
    # clinical-decision rules
    # ("iai-pecarn", "iai_pecarn.csv", "imodels"),
    #
    # popular classification datasets used in rule-based modeling / fairness
    # page 7: http://proceedings.mlr.press/v97/wang19a/wang19a.pdf
    # ("juvenile", "juvenile_clean", "imodels"),
    # ("recidivism", "compas_two_year_clean", "imodels"),
    # ("credit", "credit_card_clean", "imodels"),
    # ("readmission", "readmission_clean", "imodels"),  # very big
]
if __name__ == '__main__':
    # Script entry point: for every enabled entry in DATASETS_CLASSIFICATION,
    # build a fresh OptimalTreeClassifier with default settings, load the
    # dataset, and fit the classifier on it. The fitted model is neither
    # evaluated nor stored, so each iteration only exercises loading + fit.
    for _name, dataset_id, data_source in DATASETS_CLASSIFICATION:
        # A new estimator per dataset so no state carries over between fits.
        gosdt_cls = OptimalTreeClassifier()
        # get_clean_dataset returns three values; the third (feature names)
        # is not needed for fitting and is deliberately ignored.
        X, y, _feat_names = get_clean_dataset(dataset_id, data_source=data_source)
        gosdt_cls.fit(X, y)
|