Browse Source

Use only questions (not answers) as data

Tolstoyevsky 4 months ago
parent
commit
45dd5bfd79
4 changed files with 15 additions and 18 deletions
  1. +7 -7
      metrics.csv
  2. +7 -7
      test-metrics.csv
  3. +0 -3
      tutorial/prepare_data.py
  4. +1 -1
      tutorial/shared.py

+ 7 - 7
metrics.csv

@@ -1,8 +1,8 @@
 Name,Value,Timestamp,Step
-train_accuracy_score,0.7986549611511466,1578334939958,1
-train_f1_score,0.8157733652638022,1578334939958,1
-train_recall_score,0.812292820633461,1578334939958,1
-train_precision_score,0.8192838654012079,1578334939958,1
-train_roc_auc_score,0.8741755941513834,1578334939958,1
-train_pr_auc_score,0.8851843140233666,1578334939958,1
-train_balanced_accuracy_score,0.7971798884531922,1578334939958,1
+train_accuracy_score,0.920325,1578815337957,1
+train_f1_score,0.399472394950066,1578815337957,1
+train_recall_score,0.30233884768967484,1578815337957,1
+train_precision_score,0.5885619100499723,1578815337957,1
+train_roc_auc_score,0.8894910668527715,1578815337957,1
+train_pr_auc_score,0.4851428148074848,1578815337957,1
+train_balanced_accuracy_score,0.6410170700332519,1578815337957,1

+ 7 - 7
test-metrics.csv

@@ -1,8 +1,8 @@
 Name,Value,Timestamp,Step
-test_accuracy_score,0.7641,1578334954623,1
-test_f1_score,0.1771886989884897,1578334954623,1
-test_recall_score,0.6529562982005142,1578334954623,1
-test_precision_score,0.10250201775625505,1578334954623,1
-test_roc_auc_score,0.7928542407625796,1578334954623,1
-test_pr_auc_score,0.17297290030159415,1578334954623,1
-test_balanced_accuracy_score,0.710777389553904,1578334954623,1
+test_accuracy_score,0.9184,1578815352349,1
+test_f1_score,0.40611353711790393,1578815352349,1
+test_recall_score,0.3184931506849315,1578815352349,1
+test_precision_score,0.5602409638554217,1578815352349,1
+test_roc_auc_score,0.8858272952662356,1578815352349,1
+test_pr_auc_score,0.46026203657046505,1578815352349,1
+test_balanced_accuracy_score,0.6472452601298397,1578815352349,1

+ 0 - 3
tutorial/prepare_data.py

@@ -7,12 +7,9 @@ def split_train_test(ratio=0.2, random_seed=42):
     from sklearn.model_selection import train_test_split
 
     df = pd.read_csv(shared.raw_data, encoding='utf-8')
-    datascience_df = pd.read_csv(shared.raw_ds_data, encoding='utf-8')
     df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
-    datascience_df[shared.col_label] = True
 
     train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed, stratify=df[shared.col_label])
-    train_df = pd.concat([train_df, datascience_df], axis=0)
 
     train_df.to_csv(shared.train_data, index=False)
     test_df.to_csv(shared.test_data, index=False)

+ 1 - 1
tutorial/shared.py

@@ -2,7 +2,7 @@ import os
 
 data_dir = os.path.join(os.path.dirname(__file__), '../data/')
 outputs_dir = os.path.join(os.path.dirname(__file__), '../outputs/')
-raw_data = os.path.join(data_dir, 'CrossValidated-Posts.csv')
+raw_data = os.path.join(data_dir, 'CrossValidated-Questions.csv')
 raw_ds_data = os.path.join(data_dir, 'DataScience-Posts.csv')
 train_data = os.path.join(data_dir, 'train-raw.csv')
 test_data = os.path.join(data_dir, 'test-raw.csv')