Browse Source

Only numeric columns (no text)

Tolstoyevsky 4 months ago
parent
commit
cfbb8249c5
7 changed files with 33 additions and 56 deletions
  1. metrics.csv (+7 -7)
  2. pipeline-params.yml (+12 -28)
  3. prepare_data.sh (+3 -0)
  4. test-metrics.csv (+7 -7)
  5. tutorial/prepare_data.py (+3 -9)
  6. tutorial/shared.py (+0 -1)
  7. tutorial/train_model.py (+1 -4)

+ 7 - 7
metrics.csv

@@ -1,8 +1,8 @@
 Name,Value,Timestamp,Step
-train_accuracy_score,0.958375,1577648523091,1
-train_f1_score,0.2590120160213618,1577648523091,1
-train_recall_score,0.18689788053949905,1577648523091,1
-train_precision_score,0.4217391304347826,1577648523091,1
-train_roc_auc_score,0.8978514278435834,1577648523091,1
-train_pr_auc_score,0.29767321187017864,1577648523091,1
-train_balanced_accuracy_score,0.5882594389301038,1577648523091,1
+train_accuracy_score,0.961075,1577720954467,1
+train_f1_score,0.0,1577720954467,1
+train_recall_score,0.0,1577720954467,1
+train_precision_score,0.0,1577720954467,1
+train_roc_auc_score,0.8239694210836983,1577720954467,1
+train_pr_auc_score,0.12138721444521103,1577720954467,1
+train_balanced_accuracy_score,0.5,1577720954467,1

+ 12 - 28
pipeline-params.yml

@@ -1,33 +1,17 @@
 n_jobs: null
+num_cols__memory: null
+num_cols__simpleimputer__add_indicator: false
+num_cols__simpleimputer__copy: true
+num_cols__simpleimputer__fill_value: null
+num_cols__simpleimputer__missing_values: .nan
+num_cols__simpleimputer__strategy: mean
+num_cols__simpleimputer__verbose: 0
+num_cols__standardscaler__copy: true
+num_cols__standardscaler__with_mean: true
+num_cols__standardscaler__with_std: true
+num_cols__verbose: false
 passthrough: passthrough
 remainder: drop
-sparse_threshold: 0.3
-tfidf__functiontransformer__accept_sparse: false
-tfidf__functiontransformer__check_inverse: true
-tfidf__functiontransformer__inv_kw_args: null
-tfidf__functiontransformer__inverse_func: null
-tfidf__functiontransformer__kw_args: null
-tfidf__functiontransformer__validate: false
-tfidf__memory: null
-tfidf__tfidfvectorizer__analyzer: word
-tfidf__tfidfvectorizer__binary: false
-tfidf__tfidfvectorizer__decode_error: strict
-tfidf__tfidfvectorizer__encoding: utf-8
-tfidf__tfidfvectorizer__input: content
-tfidf__tfidfvectorizer__lowercase: true
-tfidf__tfidfvectorizer__max_df: 1.0
-tfidf__tfidfvectorizer__max_features: 25000
-tfidf__tfidfvectorizer__min_df: 1
-tfidf__tfidfvectorizer__norm: l2
-tfidf__tfidfvectorizer__preprocessor: null
-tfidf__tfidfvectorizer__smooth_idf: true
-tfidf__tfidfvectorizer__stop_words: english
-tfidf__tfidfvectorizer__strip_accents: null
-tfidf__tfidfvectorizer__sublinear_tf: false
-tfidf__tfidfvectorizer__token_pattern: (?u)\b\w\w+\b
-tfidf__tfidfvectorizer__tokenizer: null
-tfidf__tfidfvectorizer__use_idf: true
-tfidf__tfidfvectorizer__vocabulary: null
-tfidf__verbose: false
+sparse_threshold: 1.0
 transformer_weights: null
 verbose: false

+ 3 - 0
prepare_data.sh

@@ -0,0 +1,3 @@
+#!/bin/sh
+set -e
+python -m tutorial.prepare_data

+ 7 - 7
test-metrics.csv

@@ -1,8 +1,8 @@
 Name,Value,Timestamp,Step
-test_accuracy_score,0.9587,1577648537873,1
-test_f1_score,0.2817391304347826,1577648537873,1
-test_recall_score,0.20822622107969152,1577648537873,1
-test_precision_score,0.43548387096774194,1577648537873,1
-test_roc_auc_score,0.8590647926714221,1577648537873,1
-test_pr_auc_score,0.2785421871491039,1577648537873,1
-test_balanced_accuracy_score,0.5986506196439972,1577648537873,1
+test_accuracy_score,0.9611,1577720956976,1
+test_f1_score,0.0,1577720956976,1
+test_recall_score,0.0,1577720956976,1
+test_precision_score,0.0,1577720956976,1
+test_roc_auc_score,0.8052171636024382,1577720956976,1
+test_pr_auc_score,0.10107128712776992,1577720956976,1
+test_balanced_accuracy_score,0.5,1577720956976,1

+ 3 - 9
tutorial/prepare_data.py

@@ -9,9 +9,6 @@ def split_train_test(ratio=0.2, random_seed=42):
     df = pd.read_csv(shared.raw_data, encoding='utf-8')
     df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
 
-    df_positive = df[df[shared.col_label]]
-    df_negative = df[df[shared.col_label] != True]
-
     train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed, stratify=df[shared.col_label])
 
     train_df.to_csv(shared.train_data, index=False)
@@ -62,23 +59,20 @@ def tokenizer(s):
 def build_pipeline():
     from sklearn.impute import SimpleImputer
     from sklearn.preprocessing import StandardScaler, FunctionTransformer
-    from sklearn.feature_extraction.text import TfidfVectorizer
     from sklearn.pipeline import make_pipeline
     from sklearn.compose import ColumnTransformer
 
     import tutorial.prepare_data
-    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word', max_features=25000)
     from tutorial import prepare_data # Required for proper pickling of this pipeline
     return ColumnTransformer([
         ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
-        ('tfidf', make_pipeline(FunctionTransformer(prepare_data.text_col), tfidf), shared.text_cols)
+        ('num_cols', make_pipeline(SimpleImputer(),StandardScaler()), shared.extra_feature_cols),
     ])
 
 
 def map_dataframe(df, pipeline):
-    tfidf_cols = [f'Text_{col}' for col in pipeline.named_transformers_.tfidf[1].get_feature_names()]
-    cols = [shared.col_id, shared.col_label] + tfidf_cols
-    return pd.DataFrame.sparse.from_spmatrix(pipeline.transform(df), columns=cols)
+    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols
+    return pd.DataFrame(pipeline.transform(df).astype(float), columns=cols)
 
 
 def prepare_data():

+ 0 - 1
tutorial/shared.py

@@ -40,7 +40,6 @@ def load_labels(path=train_data):
 
 def compute_metrics(clf, X, y, prefix):
     from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, auc, precision_recall_curve
-    X = X.sparse.to_coo()
     preds = clf.predict(X)
     probas = clf.predict_proba(X)[:,1]
     pr_curve = precision_recall_curve(y, probas)

+ 1 - 4
tutorial/train_model.py

@@ -23,10 +23,7 @@ def fit_model(params: dict):
     from sklearn.ensemble import AdaBoostClassifier as Classifier
     clf = Classifier(**params)
     print("Training model ", clf)
-    # Required for efficient training, so that sklearn doesn't inflate the pandas sparse DF to a dense matrix.
-    # sklearn only supports scipy sparse matrices.
-    X_sparse = X.sparse.to_coo() 
-    clf.fit(X_sparse, y)
+    clf.fit(X, y)
     print("Done")
 
     return X, y, clf