Browse Source

Training code seems done

Tolstoyevsky 5 months ago
parent
commit
de46c0aa1a
8 changed files with 161 additions and 2 deletions
  1. 2 0
      .gitignore
  2. 7 0
      metrics.csv
  3. 10 0
      params.yml
  4. 2 2
      requirements.txt
  5. 0 0
      tutorial/__init__.py
  6. 62 0
      tutorial/prepare_data.py
  7. 32 0
      tutorial/shared.py
  8. 46 0
      tutorial/train_model.py

+ 2 - 0
.gitignore

@@ -1,6 +1,8 @@
 /data/
 /env/
+/outputs/
 
 # IDE
 .vscode/
 .idea/
+__pycache__/

+ 7 - 0
metrics.csv

@@ -0,0 +1,7 @@
+Name,Value,Timestamp,Step
+accuracy_score,0.96865,1577018411759,1
+f1_score,0.339304531085353,1577018411759,1
+recall_score,0.20680796403339757,1577018411759,1
+precision_score,0.9442815249266863,1577018411759,1
+roc_auc_score,0.6031568629052871,1577018411759,1
+balanced_accuracy_score,0.6031568629052871,1577018411759,1

+ 10 - 0
params.yml

@@ -0,0 +1,10 @@
+alpha: 1.0
+class_weight: null
+classifier_type: RidgeClassifier
+copy_X: true
+fit_intercept: true
+max_iter: null
+normalize: false
+random_state: null
+solver: auto
+tol: 0.001

+ 2 - 2
requirements.txt

@@ -1,3 +1,3 @@
-pandas
 sklearn
-dagshub
+pandas
+dagshub

+ 0 - 0
tutorial/__init__.py


+ 62 - 0
tutorial/prepare_data.py

@@ -0,0 +1,62 @@
+from . import shared
+from .shared import save
+
+def split_train_test(ratio=0.2, random_seed=42):
+    import pandas as pd
+    from sklearn.model_selection import train_test_split
+
+    df = pd.read_csv(shared.raw_data, encoding='utf-8')
+    df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
+
+    df_positive = df[df[shared.col_label]]
+    df_negative = df[df[shared.col_label] != True]
+
+    train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed, stratify=df[shared.col_label])
+
+    train_df.to_csv(shared.train_data, index=False)
+    test_df.to_csv(shared.test_data, index=False)
+
+
+def text_preprocessing(s):
+    """Return the text unchanged; placeholder for future cleaning.
+
+    Passed to ``TfidfVectorizer`` as its ``preprocessor`` in ``vectorize_text``.
+    """
+    # TODO: Clean HTML tags
+    return s
+
+
+def vectorize_text():
+    import pandas as pd
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    
+    train_df = pd.read_csv(shared.train_data, encoding='utf-8')
+    test_df = pd.read_csv(shared.test_data, encoding='utf-8')
+
+    def text_col(df):
+        df[shared.col_text] = (df[shared.col_title].fillna('') + df[shared.col_body].fillna('')).astype('U', copy=False)
+
+    text_col(train_df)
+    text_col(test_df)
+    
+    vectorizer = TfidfVectorizer(encoding='utf-8', preprocessor=text_preprocessing, stop_words='english', analyzer='word', max_features=50000)
+    vectorizer.fit(train_df[shared.col_text])
+
+    train_tfidf_mat = vectorizer.transform(train_df[shared.col_text])
+    test_tfidf_mat = vectorizer.transform(test_df[shared.col_text])
+
+    return train_df, test_df, vectorizer, train_tfidf_mat, test_tfidf_mat
+
+def prepare_data():
+    train_df, test_df, vectorizer, train_tfidf_mat, test_tfidf_mat = vectorize_text()
+    save(vectorizer, shared.vectorizer_pkl)
+    train_df[[shared.col_id, shared.col_text]].to_csv(shared.train_processed, index=False)
+    test_df[[shared.col_id, shared.col_text]].to_csv(shared.test_processed, index=False)
+    save(train_tfidf_mat, shared.train_tfidf)
+    save(test_tfidf_mat, shared.test_tfidf)
+
+
+def main():
+    split_train_test()
+    prepare_data()
+
+
+if __name__ == "__main__":
+    main()
+    

+ 32 - 0
tutorial/shared.py

@@ -0,0 +1,32 @@
+import os
+
+# Base directories, expressed relative to the directory containing this file.
+data_dir = os.path.join(os.path.dirname(__file__), '../data/')
+outputs_dir = os.path.join(os.path.dirname(__file__), '../outputs/')
+# Raw input and the files derived from it, all under data_dir.
+raw_data = os.path.join(data_dir, 'CrossValidated-Posts.csv')
+train_data = os.path.join(data_dir, 'train-raw.csv')
+test_data = os.path.join(data_dir, 'test-raw.csv')
+train_processed = os.path.join(data_dir, 'train-processed.csv')
+test_processed = os.path.join(data_dir, 'test-processed.csv')
+train_tfidf = os.path.join(data_dir, 'train-tfidf.pkl')
+test_tfidf = os.path.join(data_dir, 'test-tfidf.pkl')
+# Pickled vectorizer and classifier, under outputs_dir.
+vectorizer_pkl = os.path.join(outputs_dir, 'tfidf-vectorizer.pkl')
+classifier_pkl = os.path.join(outputs_dir, 'classifier.pkl')
+
+# Column names used when reading and writing the CSV data frames.
+col_id = 'Id'
+col_text = 'Text'
+col_title = 'Title'
+col_body = 'Body'
+col_tags = 'Tags'
+col_label = 'IsTaggedML'
+
+
+def save(obj, path):
+    import pickle
+    with open(path, 'wb') as f:
+        pickle.dump(obj, f)
+
+
+def load(path):
+    import pickle
+    with open(path, 'rb') as f:
+        return pickle.load(f)

+ 46 - 0
tutorial/train_model.py

@@ -0,0 +1,46 @@
+import pandas as pd
+from . import shared
+
+
+def load_labels(path=shared.train_data):
+    return pd.read_csv(path, usecols=[shared.col_label])[shared.col_label]
+
+
+def fit_model():
+    tfidf = shared.load(shared.train_tfidf)
+    y = load_labels()
+
+    from sklearn.linear_model import RidgeClassifier
+    clf = RidgeClassifier()
+    clf.fit(tfidf, y)
+
+    return y, clf.predict(tfidf), clf
+
+
+def eval_on_train_data(y, preds):
+    from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score
+    return {
+        "train_accuracy_score": accuracy_score(y, preds),
+        "train_f1_score": f1_score(y, preds),
+        "train_recall_score": recall_score(y, preds),
+        "train_precision_score": precision_score(y, preds),
+        "train_roc_auc_score": roc_auc_score(y, preds),
+        "train_balanced_accuracy_score": balanced_accuracy_score(y, preds)
+    }
+
+
+def main():
+    y, preds, clf = fit_model()
+    metrics = eval_on_train_data(y, preds)
+    from dagshub import dagshub_logger
+    with dagshub_logger() as logger:
+        logger.log_hyperparams(clf.get_params(), classifier_type=type(clf).__name__)
+        logger.log_metrics(metrics)
+    shared.save(clf, shared.classifier_pkl)
+    
+    # For possible interactive use
+    return y, preds, clf, metrics
+
+
+if __name__ == "__main__":
+    main()