Added the "text len" column

Tolstoyevsky committed 4 months ago
commit db4fd2a289
7 changed files with 66 additions and 39 deletions
  1. metrics.csv (+7 -7)
  2. pipeline-params.yml (+36 -1)
  3. test-metrics.csv (+7 -7)
  4. tutorial/prepare_data.py (+13 -5)
  5. tutorial/shared.py (+2 -0)
  6. tutorial/train_model.py (+1 -1)
  7. vectorizer-params.yml (+0 -18)

metrics.csv (+7 -7)

@@ -1,8 +1,8 @@
 Name,Value,Timestamp,Step
-train_accuracy_score,0.961075,1577720954467,1
-train_f1_score,0.0,1577720954467,1
-train_recall_score,0.0,1577720954467,1
-train_precision_score,0.0,1577720954467,1
-train_roc_auc_score,0.8239694210836983,1577720954467,1
-train_pr_auc_score,0.12138721444521103,1577720954467,1
-train_balanced_accuracy_score,0.5,1577720954467,1
+train_accuracy_score,0.964425,1578236445541,1
+train_f1_score,0.39626644039032666,1578236445541,1
+train_recall_score,0.2999357739242132,1578236445541,1
+train_precision_score,0.58375,1578236445541,1
+train_roc_auc_score,0.9524198435000841,1578236445541,1
+train_pr_auc_score,0.4720101231706403,1578236445541,1
+train_balanced_accuracy_score,0.6456367993778911,1578236445541,1
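
The train F1, recall, and precision moving off 0.0 shows the earlier model predicted the negative class for every sample (hence balanced accuracy of exactly 0.5); the new text features give it usable positive-class signal. The pr_auc_score rows come from the precision-recall curve. A minimal sketch of that computation, using the same sklearn imports that compute_metrics in tutorial/shared.py pulls in; the toy labels and probabilities are illustrative only:

    # PR-AUC: area under the precision-recall curve.
    from sklearn.metrics import auc, precision_recall_curve

    y_true = [0, 0, 1, 1, 0, 1]               # toy ground-truth labels
    probas = [0.1, 0.4, 0.35, 0.8, 0.2, 0.7]  # toy positive-class probabilities

    precision, recall, _ = precision_recall_curve(y_true, probas)
    pr_auc = auc(recall, precision)  # recall is monotonic, as auc() requires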

pipeline-params.yml (+36 -1)

@@ -12,6 +12,41 @@ num_cols__standardscaler__with_std: true
 num_cols__verbose: false
 passthrough: passthrough
 remainder: drop
-sparse_threshold: 1.0
+sparse_threshold: 0.3
+text__featureunion__functiontransformer__accept_sparse: false
+text__featureunion__functiontransformer__check_inverse: true
+text__featureunion__functiontransformer__inv_kw_args: null
+text__featureunion__functiontransformer__inverse_func: null
+text__featureunion__functiontransformer__kw_args: null
+text__featureunion__functiontransformer__validate: false
+text__featureunion__n_jobs: null
+text__featureunion__tfidfvectorizer__analyzer: word
+text__featureunion__tfidfvectorizer__binary: false
+text__featureunion__tfidfvectorizer__decode_error: strict
+text__featureunion__tfidfvectorizer__encoding: utf-8
+text__featureunion__tfidfvectorizer__input: content
+text__featureunion__tfidfvectorizer__lowercase: true
+text__featureunion__tfidfvectorizer__max_df: 1.0
+text__featureunion__tfidfvectorizer__max_features: 25000
+text__featureunion__tfidfvectorizer__min_df: 1
+text__featureunion__tfidfvectorizer__norm: l2
+text__featureunion__tfidfvectorizer__preprocessor: null
+text__featureunion__tfidfvectorizer__smooth_idf: true
+text__featureunion__tfidfvectorizer__stop_words: english
+text__featureunion__tfidfvectorizer__strip_accents: null
+text__featureunion__tfidfvectorizer__sublinear_tf: false
+text__featureunion__tfidfvectorizer__token_pattern: (?u)\b\w\w+\b
+text__featureunion__tfidfvectorizer__use_idf: true
+text__featureunion__tfidfvectorizer__vocabulary: null
+text__featureunion__transformer_weights: null
+text__featureunion__verbose: false
+text__functiontransformer__accept_sparse: false
+text__functiontransformer__check_inverse: true
+text__functiontransformer__inv_kw_args: null
+text__functiontransformer__inverse_func: null
+text__functiontransformer__kw_args: null
+text__functiontransformer__validate: false
+text__memory: null
+text__verbose: false
 transformer_weights: null
 verbose: false
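
The new text__featureunion__tfidfvectorizer__* keys are the double-underscore parameter paths that scikit-learn's get_params(deep=True) generates for nested estimators, so this file reads like that dict serialized to YAML. A hedged sketch of how such a file could be produced; the helper name and the scalar-only filtering are assumptions, not the tutorial's actual code:

    import yaml

    def dump_pipeline_params(pipeline, path='pipeline-params.yml'):
        # get_params(deep=True) flattens nested estimators into keys like
        # text__featureunion__tfidfvectorizer__max_df.
        params = pipeline.get_params(deep=True)
        # Keep only YAML-safe scalars; estimator objects and the like are skipped.
        scalars = {k: v for k, v in params.items()
                   if v is None or isinstance(v, (str, int, float, bool))}
        with open(path, 'w') as f:
            yaml.safe_dump(scalars, f, default_flow_style=False)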

test-metrics.csv (+7 -7)

@@ -1,8 +1,8 @@
 Name,Value,Timestamp,Step
-test_accuracy_score,0.9611,1577720956976,1
-test_f1_score,0.0,1577720956976,1
-test_recall_score,0.0,1577720956976,1
-test_precision_score,0.0,1577720956976,1
-test_roc_auc_score,0.8052171636024382,1577720956976,1
-test_pr_auc_score,0.10107128712776992,1577720956976,1
-test_balanced_accuracy_score,0.5,1577720956976,1
+test_accuracy_score,0.9653,1578236459984,1
+test_f1_score,0.4006908462867012,1578236459984,1
+test_recall_score,0.2982005141388175,1578236459984,1
+test_precision_score,0.6105263157894737,1578236459984,1
+test_roc_auc_score,0.9346758574352064,1578236459984,1
+test_pr_auc_score,0.4627289977729209,1578236459984,1
+test_balanced_accuracy_score,0.645250501580906,1578236459984,1

tutorial/prepare_data.py (+13 -5)

@@ -42,6 +42,10 @@ def text_col(df):
     return (df[shared.col_title].fillna('') + df[shared.col_body].fillna('')).astype('U', copy=False).apply(text_preprocess)
 
 
+def text_len_col(text):
+    return text.str.len().values.reshape(-1, 1)
+
+
 import re
 token_pattern = re.compile(r"(?u)\b\w\w+\b")
 # TODO: Better number pattern
@@ -59,20 +63,24 @@ def tokenizer(s):
 def build_pipeline():
     from sklearn.impute import SimpleImputer
     from sklearn.preprocessing import StandardScaler, FunctionTransformer
-    from sklearn.pipeline import make_pipeline
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    from sklearn.pipeline import make_pipeline, make_union
     from sklearn.compose import ColumnTransformer
 
-    import tutorial.prepare_data
     from tutorial import prepare_data # Required for proper pickling of this pipeline
+    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word', max_features=25000, ngram_range=(1, 2), tokenizer=prepare_data.tokenizer)
+    union = make_union(FunctionTransformer(prepare_data.text_len_col), tfidf)
     return ColumnTransformer([
         ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
-        ('num_cols', make_pipeline(SimpleImputer(),StandardScaler()), shared.extra_feature_cols),
+        ('num_cols', make_pipeline(SimpleImputer(), StandardScaler()), shared.extra_feature_cols),
+        ('text', make_pipeline(FunctionTransformer(prepare_data.text_col), union), shared.text_cols)
     ])
 
 
 def map_dataframe(df, pipeline):
-    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols
-    return pd.DataFrame(pipeline.transform(df).astype(float), columns=cols)
+    tfidf_cols = [f'Text_{col}' for col in pipeline.named_transformers_.text[1].transformer_list[1][1].get_feature_names()]
+    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols + [shared.text_len_col] + tfidf_cols
+    return pd.DataFrame.sparse.from_spmatrix(pipeline.transform(df).astype(float), columns=cols)
 
 
 def prepare_data():
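
The new 'text' branch first collapses the Title and Body columns into a single string per row (text_col), then feeds that through a union of two transformers: a FunctionTransformer computing raw character length and a TfidfVectorizer. A self-contained sketch of that shape, using plain example columns and default vectorizer settings rather than the tutorial's real schema, tokenizer, and n-gram configuration:

    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline, make_union
    from sklearn.preprocessing import FunctionTransformer

    def text_col(df):
        # Collapse the text columns into one Series of strings.
        return df['Title'].fillna('') + df['Body'].fillna('')

    def text_len_col(text):
        # Series of strings -> (n_samples, 1) column of character counts.
        return text.str.len().values.reshape(-1, 1)

    union = make_union(FunctionTransformer(text_len_col), TfidfVectorizer())
    pipe = make_pipeline(FunctionTransformer(text_col), union)

    df = pd.DataFrame({'Title': ['How to train a model', None],
                       'Body': ['Use scikit-learn.', 'Just fit it.']})
    X = pipe.fit_transform(df)  # column 0: text length; the rest: TF-IDF terms

map_dataframe then names the TF-IDF features by reading get_feature_names() off the fitted TfidfVectorizer inside the union, as the diff above shows.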

tutorial/shared.py (+2 -0)

@@ -18,6 +18,7 @@ col_tags = 'Tags'
 col_label = 'IsTaggedML'
 extra_feature_cols = ['Score','ViewCount','AnswerCount','CommentCount','FavoriteCount']
 text_cols = [col_title, col_body]
+text_len_col = 'Text_Len'
 all_raw_cols = ['Id','Title','Body','Tags','CreationDate','Score','ViewCount','AnswerCount','CommentCount','FavoriteCount','IsTaggedML']
 
 
@@ -40,6 +41,7 @@ def load_labels(path=train_data):
 
 def compute_metrics(clf, X, y, prefix):
     from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, auc, precision_recall_curve
+    X = X.sparse.to_coo()
     preds = clf.predict(X)
     probas = clf.predict_proba(X)[:,1]
     pr_curve = precision_recall_curve(y, probas)
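
compute_metrics now receives the sparse-backed DataFrame that map_dataframe builds with pd.DataFrame.sparse.from_spmatrix, and converts it back to a scipy matrix before predicting. A minimal sketch of that round trip on a toy matrix; the column names are illustrative:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    mat = sparse.random(4, 3, density=0.5, format='csr', random_state=0)
    df = pd.DataFrame.sparse.from_spmatrix(mat, columns=['a', 'b', 'c'])

    coo = df.sparse.to_coo()  # back to a scipy COO matrix
    assert np.allclose(coo.toarray(), mat.toarray())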

tutorial/train_model.py (+1 -1)

@@ -23,7 +23,7 @@ def fit_model(params: dict):
     from sklearn.ensemble import AdaBoostClassifier as Classifier
     clf = Classifier(**params)
     print("Training model ", clf)
-    clf.fit(X, y)
+    clf.fit(X.sparse.to_coo(), y)
     print("Done")
 
     return X, y, clf
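
The one-line change hands the classifier a scipy COO matrix instead of the sparse-backed DataFrame; AdaBoostClassifier accepts sparse input directly. A minimal sketch on synthetic data:

    import numpy as np
    import pandas as pd
    from scipy import sparse
    from sklearn.ensemble import AdaBoostClassifier

    X_df = pd.DataFrame.sparse.from_spmatrix(
        sparse.random(100, 20, density=0.1, format='csr', random_state=0))
    y = np.random.RandomState(0).randint(0, 2, size=100)

    clf = AdaBoostClassifier(n_estimators=50)
    clf.fit(X_df.sparse.to_coo(), y)  # scikit-learn converts COO to CSR/CSC internally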

vectorizer-params.yml (+0 -18)

@@ -1,18 +0,0 @@
-analyzer: word
-binary: false
-decode_error: strict
-encoding: utf-8
-input: content
-lowercase: true
-max_df: 1.0
-max_features: 50000
-min_df: 1
-norm: l2
-smooth_idf: true
-stop_words: english
-strip_accents: null
-sublinear_tf: false
-token_pattern: (?u)\b\w\w+\b
-tokenizer: null
-use_idf: true
-vocabulary: null