Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

main.py 2.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
  1. import pandas as pd
  2. from sklearn.feature_extraction.text import TfidfVectorizer
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, \
  5. f1_score
  6. from sklearn.model_selection import train_test_split
  7. def feature_engineering(raw_df):
  8. df = raw_df.copy()
  9. df['len'] = df.comment.str.len()
  10. df['comment'] = df['comment'].fillna('')
  11. df['label'] = df.isHate.apply(lambda x: float(x>=0.5))
  12. df = df.drop(columns=['isHate'])
  13. return df
  14. def fit_tfidf(train_df, test_df):
  15. tfidf = TfidfVectorizer(max_features=25000)
  16. tfidf.fit(train_df['comment'])
  17. train_tfidf = tfidf.transform(train_df['comment'])
  18. test_tfidf = tfidf.transform(test_df['comment'])
  19. return train_tfidf, test_tfidf, tfidf
  20. def fit_model(train_X, train_y):
  21. clf_tfidf = LogisticRegression(solver='sag')
  22. clf_tfidf.fit(train_X, train_y)
  23. return clf_tfidf
  24. def eval_model(clf, X, y):
  25. y_proba = clf.predict_proba(X)[:, 1]
  26. y_pred = clf.predict(X)
  27. return {
  28. 'roc_auc': roc_auc_score(y, y_proba),
  29. 'average_precision': average_precision_score(y, y_proba),
  30. 'accuracy': accuracy_score(y, y_pred),
  31. 'precision': precision_score(y, y_pred),
  32. 'recall': recall_score(y, y_pred),
  33. 'f1': f1_score(y, y_pred),
  34. }
  35. if __name__ == '__main__':
  36. print('Loading data...')
  37. df = pd.read_csv('data/Ethos_Dataset_Binary.csv', delimiter=';')
  38. train_df, test_df = train_test_split(df)
  39. train_df = feature_engineering(train_df)
  40. test_df = feature_engineering(test_df)
  41. print('Fitting TFIDF...')
  42. train_tfidf, test_tfidf, tfidf = fit_tfidf(train_df, test_df)
  43. print('Fitting classifier...')
  44. train_y = train_df['label']
  45. model = fit_model(train_tfidf, train_y)
  46. train_metrics = eval_model(model, train_tfidf, train_y)
  47. print('Train metrics:')
  48. print(train_metrics)
  49. test_metrics = eval_model(model, test_tfidf, test_df['label'])
  50. print('Test metrics:')
  51. print(test_metrics)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...