Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

main.py 2.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
  1. import pandas as pd
  2. from sklearn.feature_extraction.text import TfidfVectorizer
  3. from sklearn.linear_model import LogisticRegression
  4. from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, \
  5. f1_score
  6. from sklearn.model_selection import train_test_split
  7. def feature_engineering(raw_df):
  8. df = raw_df.copy()
  9. df['CreationDate'] = pd.to_datetime(df['CreationDate'])
  10. df['CreationDate_Epoch'] = df['CreationDate'].astype('int64') // 10 ** 9
  11. df['MachineLearning'] = df['Tags'].str.contains('machine-learning').fillna(False)
  12. df = df.drop(columns=['Id', 'Tags'])
  13. df['Title_Len'] = df.Title.str.len()
  14. df['Body_Len'] = df.Body.str.len()
  15. # Drop the correlated features
  16. df = df.drop(columns=['FavoriteCount'])
  17. df['Text'] = df['Title'].fillna('') + ' ' + df['Body'].fillna('')
  18. return df
  19. def fit_tfidf(train_df, test_df):
  20. tfidf = TfidfVectorizer(max_features=25000)
  21. tfidf.fit(train_df['Text'])
  22. train_tfidf = tfidf.transform(train_df['Text'])
  23. test_tfidf = tfidf.transform(test_df['Text'])
  24. return train_tfidf, test_tfidf, tfidf
  25. def fit_model(train_X, train_y):
  26. clf_tfidf = LogisticRegression(solver='sag')
  27. clf_tfidf.fit(train_X, train_y)
  28. return clf_tfidf
  29. def eval_model(clf, X, y):
  30. y_proba = clf.predict_proba(X)[:, 1]
  31. y_pred = clf.predict(X)
  32. return {
  33. 'roc_auc': roc_auc_score(y, y_proba),
  34. 'average_precision': average_precision_score(y, y_proba),
  35. 'accuracy': accuracy_score(y, y_pred),
  36. 'precision': precision_score(y, y_pred),
  37. 'recall': recall_score(y, y_pred),
  38. 'f1': f1_score(y, y_pred),
  39. }
  40. if __name__ == '__main__':
  41. print('Loading data...')
  42. df = pd.read_csv('data/CrossValidated-Questions-Nov-2020.csv')
  43. train_df, test_df = train_test_split(df)
  44. del df
  45. train_df = feature_engineering(train_df)
  46. test_df = feature_engineering(test_df)
  47. print('Fitting TFIDF...')
  48. train_tfidf, test_tfidf, tfidf = fit_tfidf(train_df, test_df)
  49. print('Fitting classifier...')
  50. train_y = train_df['MachineLearning']
  51. model = fit_model(train_tfidf, train_y)
  52. train_metrics = eval_model(model, train_tfidf, train_y)
  53. print('Train metrics:')
  54. print(train_metrics)
  55. test_metrics = eval_model(model, test_tfidf, test_df['MachineLearning'])
  56. print('Test metrics:')
  57. print(test_metrics)
Tip!

Press p or ← to see the previous file, or n or → to see the next file.

Comments

Loading...