Browse Source

Properly calculate roc_auc and pr_auc

Tolstoyevsky 5 months ago
parent
commit
14b9d6a398
9 changed files with 49 additions and 35 deletions
  1. 7 6
      metrics.csv
  2. 15 5
      params.yml
  3. 1 1
      pipeline-params.yml
  4. 1 0
      pipeline.sh
  5. 7 6
      test-metrics.csv
  6. 3 4
      tutorial/eval.py
  7. 1 3
      tutorial/prepare_data.py
  8. 7 3
      tutorial/shared.py
  9. 7 7
      tutorial/train_model.py

+ 7 - 6
metrics.csv

@@ -1,7 +1,8 @@
 Name,Value,Timestamp,Step
-train_accuracy_score,0.964025,1577386911065,1
-train_f1_score,0.3847798204360839,1577386911065,1
-train_recall_score,0.28901734104046245,1577386911065,1
-train_precision_score,0.5754475703324808,1577386911065,1
-train_roc_auc_score,0.6401905892050374,1577386911065,1
-train_balanced_accuracy_score,0.6401905892050374,1577386911065,1
+train_accuracy_score,0.96385,1577635383942,1
+train_f1_score,0.1802721088435374,1577635383942,1
+train_recall_score,0.10211946050096339,1577635383942,1
+train_precision_score,0.7681159420289855,1577635383942,1
+train_roc_auc_score,0.9638118482549822,1577635383942,1
+train_pr_auc_score,0.5164149734981334,1577635383942,1
+train_balanced_accuracy_score,0.5504354293374416,1577635383942,1

+ 15 - 5
params.yml

@@ -1,6 +1,16 @@
-algorithm: SAMME.R
-base_estimator: null
-classifier_type: AdaBoostClassifier
-learning_rate: 1.0
-n_estimators: 50
+C: 1.0
+class_weight: null
+classifier_type: LogisticRegression
+dual: false
+fit_intercept: true
+intercept_scaling: 1
+l1_ratio: null
+max_iter: 100
+multi_class: auto
+n_jobs: null
+penalty: l2
 random_state: null
+solver: lbfgs
+tol: 0.0001
+verbose: 0
+warm_start: false

+ 1 - 1
pipeline-params.yml

@@ -27,7 +27,7 @@ tfidf__tfidfvectorizer__encoding: utf-8
 tfidf__tfidfvectorizer__input: content
 tfidf__tfidfvectorizer__lowercase: true
 tfidf__tfidfvectorizer__max_df: 1.0
-tfidf__tfidfvectorizer__max_features: 50000
+tfidf__tfidfvectorizer__max_features: 25000
 tfidf__tfidfvectorizer__min_df: 1
 tfidf__tfidfvectorizer__norm: l2
 tfidf__tfidfvectorizer__preprocessor: null

+ 1 - 0
pipeline.sh

@@ -1,4 +1,5 @@
 #!/bin/sh
+set -e
 python -m tutorial.prepare_data
 python -m tutorial.train_model
 python -m tutorial.eval

+ 7 - 6
test-metrics.csv

@@ -1,7 +1,8 @@
 Name,Value,Timestamp,Step
-test_accuracy_score,0.9641,1577386935982,1
-test_f1_score,0.3799654576856649,1577386935982,1
-test_recall_score,0.2827763496143959,1577386935982,1
-test_precision_score,0.5789473684210527,1577386935982,1
-test_roc_auc_score,0.6372262769817895,1577386935982,1
-test_balanced_accuracy_score,0.6372262769817896,1577386935982,1
+test_accuracy_score,0.9613,1577635403162,1
+test_f1_score,0.0936768149882904,1577635403162,1
+test_recall_score,0.05141388174807198,1577635403162,1
+test_precision_score,0.5263157894736842,1577635403162,1
+test_roc_auc_score,0.9169570321495909,1577635403162,1
+test_pr_auc_score,0.30071419241208036,1577635403162,1
+test_balanced_accuracy_score,0.5247705138633191,1577635403162,1

+ 3 - 4
tutorial/eval.py

@@ -5,17 +5,16 @@ def predict_test_set():
     X, y = shared.load_data_and_labels(shared.test_processed)
     print("Done")
     clf = shared.load(shared.classifier_pkl)
-    preds = clf.predict(X.sparse.to_coo())
-    return y, preds, clf, shared.compute_metrics(y, preds, "test")
+    return X, y, clf, shared.compute_metrics(clf, X, y, "test")
 
 def main():
-    y, preds, clf, metrics = predict_test_set()
+    X, y, clf, metrics = predict_test_set()
     from dagshub import dagshub_logger
     with dagshub_logger(metrics_path='test-metrics.csv', should_log_hparams=False) as logger:
         logger.log_metrics(metrics)
     
     # For possible interactive use
-    return y, preds, clf, metrics
+    return X, y, clf, metrics
 
 if __name__ == "__main__":
     main()

+ 1 - 3
tutorial/prepare_data.py

@@ -67,7 +67,7 @@ def build_pipeline():
     from sklearn.compose import ColumnTransformer
 
     import tutorial.prepare_data
-    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word', max_features=50000, ngram_range=(1, 2), tokenizer=tutorial.prepare_data.tokenizer)
+    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word', max_features=25000, ngram_range=(1, 2), tokenizer=tutorial.prepare_data.tokenizer)
     from tutorial import prepare_data # Required for proper pickling of this pipeline
     return ColumnTransformer([
         ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
@@ -105,8 +105,6 @@ def prepare_data():
     print("Saving test data")
     test_df.to_pickle(shared.test_processed)
     print("Done")
-    # save(train_df, shared.train_processed)
-    # save(test_df, shared.test_processed)
     from dagshub import dagshub_logger
     with dagshub_logger(should_log_metrics=False, hparams_path='pipeline-params.yml') as logger:
         params = {k:v for k,v in pipeline.get_params().items() if v is None or type(v) in [str,int,float,bool]}

+ 7 - 3
tutorial/shared.py

@@ -38,14 +38,18 @@ def load_labels(path=train_data):
     return pd.read_csv(path, usecols=[col_label])[col_label]
 
 
-def compute_metrics(y, preds, prefix):
-    from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score
+def compute_metrics(clf, X, y, prefix):
+    from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, auc, precision_recall_curve
+    preds = clf.predict(X)
+    probas = clf.predict_proba(X)[:,1]
+    pr_curve = precision_recall_curve(y, probas)
     return {
         f"{prefix}_accuracy_score": accuracy_score(y, preds),
         f"{prefix}_f1_score": f1_score(y, preds),
         f"{prefix}_recall_score": recall_score(y, preds),
         f"{prefix}_precision_score": precision_score(y, preds),
-        f"{prefix}_roc_auc_score": roc_auc_score(y, preds),
+        f"{prefix}_roc_auc_score": roc_auc_score(y, probas),
+        f"{prefix}_pr_auc_score": auc(pr_curve[1], pr_curve[0]),
         f"{prefix}_balanced_accuracy_score": balanced_accuracy_score(y, preds)
     }
 

+ 7 - 7
tutorial/train_model.py

@@ -7,7 +7,7 @@ def fit_model(params: dict):
     X, y = shared.load_data_and_labels(shared.train_processed)
     print("Done")
 
-    from sklearn.ensemble import AdaBoostClassifier as Classifier
+    from sklearn.linear_model import LogisticRegression as Classifier
     clf = Classifier(**params)
     print("Training model ", clf)
     # Required for efficient training, so that sklearn doesn't inflate the pandas sparse DF to a dense matrix.
@@ -16,16 +16,16 @@ def fit_model(params: dict):
     clf.fit(X_sparse, y)
     print("Done")
 
-    return y, clf.predict(X_sparse), clf
+    return X, y, clf
 
 
-def eval_on_train_data(y, preds):
-    return shared.compute_metrics(y, preds, "train")
+def eval_on_train_data(clf, X, y):
+    return shared.compute_metrics(clf, X, y, "train")
 
 
 def main(params: dict):
-    y, preds, clf = fit_model(params)
-    metrics = eval_on_train_data(y, preds)
+    X, y, clf = fit_model(params)
+    metrics = eval_on_train_data(clf, X, y)
     from dagshub import dagshub_logger
     with dagshub_logger() as logger:
         logger.log_hyperparams(clf.get_params(), classifier_type=type(clf).__name__)
@@ -33,7 +33,7 @@ def main(params: dict):
     shared.save(clf, shared.classifier_pkl)
     
     # For possible interactive use
-    return y, preds, clf, metrics
+    return X, y, clf, metrics
 
 
 if __name__ == "__main__":