Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

main.py 3.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
  1. import argparse
  2. import pandas as pd
  3. from sklearn.feature_extraction.text import TfidfVectorizer
  4. from sklearn.linear_model import SGDClassifier
  5. from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, \
  6. f1_score
  7. from sklearn.model_selection import train_test_split
  8. import joblib
  9. import dagshub
  10. # Consts
  11. CLASS_LABEL = 'label'
  12. train_df_path = 'data/train.csv.zip'
  13. test_df_path = 'data/test.csv.zip'
  14. def feature_engineering(raw_df):
  15. df = raw_df.copy()
  16. df['len'] = df.comment.str.len()
  17. df['comment'] = df['comment'].fillna('')
  18. df = df.drop(columns=['isHate'])
  19. return df
  20. def fit_tfidf(train_df, test_df):
  21. tfidf = TfidfVectorizer(max_features=25000)
  22. tfidf.fit(train_df['comment'])
  23. train_tfidf = tfidf.transform(train_df['comment'])
  24. test_tfidf = tfidf.transform(test_df['comment'])
  25. return train_tfidf, test_tfidf, tfidf
  26. def fit_model(train_X, train_y, random_state=42):
  27. clf_tfidf = SGDClassifier(loss='modified_huber', random_state=random_state)
  28. clf_tfidf.fit(train_X, train_y)
  29. return clf_tfidf
  30. def eval_model(clf, X, y):
  31. y_proba = clf.predict_proba(X)[:, 1]
  32. y_pred = clf.predict(X)
  33. return {
  34. 'roc_auc': roc_auc_score(y, y_proba),
  35. 'average_precision': average_precision_score(y, y_proba),
  36. 'accuracy': accuracy_score(y, y_pred),
  37. 'precision': precision_score(y, y_pred),
  38. 'recall': recall_score(y, y_pred),
  39. 'f1': f1_score(y, y_pred),
  40. }
  41. def split(random_state=42):
  42. print('Loading data...')
  43. df = pd.read_csv('data/Ethos_Dataset_Binary.csv', delimiter=';')
  44. df[CLASS_LABEL] = df.isHate.apply(lambda x: float(x>=0.5))
  45. # df[CLASS_LABEL] = df['Tags'].str.contains('machine-learning').fillna(False)
  46. train_df, test_df = train_test_split(df, random_state=random_state, stratify=df[CLASS_LABEL])
  47. print('Saving split data...')
  48. train_df.to_csv(train_df_path)
  49. test_df.to_csv(test_df_path)
  50. def train():
  51. print('Loading data...')
  52. train_df = pd.read_csv(train_df_path)
  53. test_df = pd.read_csv(test_df_path)
  54. print('Engineering features...')
  55. train_df = feature_engineering(train_df)
  56. test_df = feature_engineering(test_df)
  57. with dagshub.dagshub_logger() as logger:
  58. print('Fitting TFIDF...')
  59. train_tfidf, test_tfidf, tfidf = fit_tfidf(train_df, test_df)
  60. print('Saving TFIDF object...')
  61. joblib.dump(tfidf, 'outputs/tfidf.joblib')
  62. logger.log_hyperparams({'tfidf': tfidf.get_params()})
  63. print('Training model...')
  64. train_y = train_df[CLASS_LABEL]
  65. model = fit_model(train_tfidf, train_y)
  66. print('Saving trained model...')
  67. joblib.dump(model, 'outputs/model.joblib')
  68. logger.log_hyperparams(model_class=type(model).__name__)
  69. logger.log_hyperparams({'model': model.get_params()})
  70. print('Evaluating model...')
  71. train_metrics = eval_model(model, train_tfidf, train_y)
  72. print('Train metrics:')
  73. print(train_metrics)
  74. logger.log_metrics({f'train__{k}': v for k,v in train_metrics.items()})
  75. test_metrics = eval_model(model, test_tfidf, test_df[CLASS_LABEL])
  76. print('Test metrics:')
  77. print(test_metrics)
  78. logger.log_metrics({f'test__{k}': v for k,v in test_metrics.items()})
  79. if __name__ == '__main__':
  80. parser = argparse.ArgumentParser()
  81. subparsers = parser.add_subparsers(title='Split or Train step:', dest='step')
  82. subparsers.required = True
  83. split_parser = subparsers.add_parser('split')
  84. split_parser.set_defaults(func=split)
  85. train_parser = subparsers.add_parser('train')
  86. train_parser.set_defaults(func=train)
  87. parser.parse_args().func()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...