1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
|
- import pandas as pd
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.linear_model import LogisticRegression
- from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, \
- f1_score
- from sklearn.model_selection import train_test_split
def feature_engineering(raw_df):
    """Derive model features and the binary label from the raw dataset.

    Works on a copy, so ``raw_df`` is left unmodified.

    Args:
        raw_df: DataFrame with a ``comment`` column (text, may contain
            missing values) and a numeric ``isHate`` column.

    Returns:
        A new DataFrame in which:
          * ``comment`` has missing values replaced by the empty string,
          * ``len`` holds the character length of each comment,
          * ``label`` is 1.0 where ``isHate >= 0.5`` and 0.0 otherwise
            (a missing ``isHate`` compares false and therefore yields 0.0),
          * the ``isHate`` column has been dropped.
    """
    df = raw_df.copy()
    # Fill missing comments *before* measuring them; otherwise a missing
    # comment would get a NaN length instead of 0.
    df['comment'] = df['comment'].fillna('')
    df['len'] = df['comment'].str.len()
    # Vectorized threshold; equivalent to applying float(x >= 0.5) per row.
    df['label'] = (df['isHate'] >= 0.5).astype(float)
    return df.drop(columns=['isHate'])
def fit_tfidf(train_df, test_df, max_features=25000):
    """Fit a TF-IDF vectorizer on the training comments and vectorize both splits.

    The vectorizer is fitted on ``train_df['comment']`` only; the test
    comments are transformed with that already-fitted vectorizer, so nothing
    is learned from the test split.

    Args:
        train_df: DataFrame with a ``comment`` column used for fitting.
        test_df: DataFrame with a ``comment`` column to be transformed.
        max_features: Value passed to ``TfidfVectorizer(max_features=...)``.
            Defaults to 25000, the value previously hard-coded here.

    Returns:
        A tuple ``(train_tfidf, test_tfidf, tfidf)``: the vectorized training
        comments, the vectorized test comments, and the fitted vectorizer.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    # fit_transform fits and vectorizes the training text in one pass rather
    # than processing the same corpus twice with fit() followed by transform().
    train_tfidf = tfidf.fit_transform(train_df['comment'])
    test_tfidf = tfidf.transform(test_df['comment'])
    return train_tfidf, test_tfidf, tfidf
def fit_model(train_X, train_y):
    """Train a logistic-regression classifier with the ``'sag'`` solver.

    Args:
        train_X: Feature matrix to fit on.
        train_y: Target labels aligned with the rows of ``train_X``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    # fit() hands back the estimator itself, so build-and-train can be chained.
    return LogisticRegression(solver='sag').fit(train_X, train_y)
def eval_model(clf, X, y):
    """Score a fitted classifier on ``(X, y)`` and return a dict of metrics.

    Args:
        clf: Fitted classifier exposing ``predict_proba`` and ``predict``.
        X: Feature matrix to evaluate on.
        y: True labels aligned with the rows of ``X``.

    Returns:
        Dict with the keys ``roc_auc``, ``average_precision``, ``accuracy``,
        ``precision``, ``recall`` and ``f1`` (in that order).
    """
    # Column 1 of predict_proba is used as the score for the ranking metrics
    # (presumably the positive class; confirm against clf.classes_).
    scores = clf.predict_proba(X)[:, 1]
    hard_labels = clf.predict(X)

    # Metrics computed from continuous scores.
    score_based = (
        ('roc_auc', roc_auc_score),
        ('average_precision', average_precision_score),
    )
    # Metrics computed from the predicted class labels.
    label_based = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )

    report = {name: metric(y, scores) for name, metric in score_based}
    report.update((name, metric(y, hard_labels)) for name, metric in label_based)
    return report
if __name__ == '__main__':
    # Script entry point: load the dataset, split it, vectorize the comments,
    # train the classifier, and print metrics for both splits.
    print('Loading data...')
    df = pd.read_csv('data/Ethos_Dataset_Binary.csv', delimiter=';')

    # NOTE(review): the split is called without a seed, so the reported
    # metrics can differ from run to run.
    # Split first, then engineer features on each part independently.
    train_df, test_df = (feature_engineering(part) for part in train_test_split(df))

    print('Fitting TFIDF...')
    train_tfidf, test_tfidf, tfidf = fit_tfidf(train_df, test_df)

    print('Fitting classifier...')
    model = fit_model(train_tfidf, train_df['label'])

    # Evaluate on the training split first, then on the held-out split.
    evaluations = (
        ('Train metrics:', train_tfidf, train_df['label']),
        ('Test metrics:', test_tfidf, test_df['label']),
    )
    for heading, features, labels in evaluations:
        metrics = eval_model(model, features, labels)
        print(heading)
        print(metrics)
|