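"""Train a DistilBERT hate-speech classifier on the Ethos binary dataset and
log hyperparameters and metrics with the DagsHub logger.

The script has two CLI steps (the filename `main.py` below is an assumption;
this listing does not name the file):

    python main.py split   # create and save the train/test split
    python main.py train   # tokenize, fine-tune, and evaluate
"""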
import argparse

import dagshub
import joblib
import pandas as pd
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import (accuracy_score, average_precision_score,
                             f1_score, precision_score, recall_score,
                             roc_auc_score)
from sklearn.model_selection import train_test_split
from transformers import (DistilBertForSequenceClassification,
                          DistilBertTokenizerFast, Trainer, TrainingArguments)

# Constants
CLASS_LABEL = 'label'
train_df_path = 'data/train.csv.zip'
test_df_path = 'data/test.csv.zip'


class HSDataset(torch.utils.data.Dataset):
    """Wraps tokenizer encodings and labels in the format the Trainer expects."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Store labels positionally, so a pandas Series with a non-contiguous
        # index cannot break the integer lookups in __getitem__.
        self.labels = list(labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

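
# The two helpers below are left over from an earlier TF-IDF + SGDClassifier
# baseline (see the commented-out calls in train()); the current DistilBERT
# pipeline does not call them.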
def feature_engineering(raw_df):
    """Adds a comment-length feature and drops the raw isHate score."""
    df = raw_df.copy()
    # Fill missing comments before computing lengths, so 'len' is never NaN.
    df['comment'] = df['comment'].fillna('')
    df['len'] = df.comment.str.len()
    df = df.drop(columns=['isHate'])
    return df


def fit_tfidf(train_df, test_df):
    """Fits a TF-IDF vectorizer on the training comments and transforms both splits."""
    tfidf = TfidfVectorizer(max_features=25000)
    tfidf.fit(train_df['comment'])
    train_tfidf = tfidf.transform(train_df['comment'])
    test_tfidf = tfidf.transform(test_df['comment'])
    return train_tfidf, test_tfidf, tfidf


def fit_tokenizer(train_df, test_df):
    """Tokenizes both splits with the pretrained DistilBERT tokenizer."""
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    train_tokens = tokenizer(train_df['comment'].values.tolist(), truncation=True, padding=True)
    test_tokens = tokenizer(test_df['comment'].values.tolist(), truncation=True, padding=True)
    return train_tokens, test_tokens


def fit_model(train_ds, test_ds, training_args):
    """Fine-tunes DistilBERT on the training set and returns the fitted Trainer."""
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    trainer = Trainer(
        model=model,              # the instantiated 🤗 Transformers model to be trained
        args=training_args,       # training arguments, defined in train()
        train_dataset=train_ds,   # training dataset
        eval_dataset=test_ds,     # evaluation dataset
    )
    trainer.train()
    return trainer


def eval_model(trainer, ds):
    """Computes classification metrics for the given dataset."""
    y = ds.labels
    output = trainer.predict(ds)
    # PredictionOutput.label_ids holds the ground-truth labels; the model's
    # outputs are the logits in .predictions. Take the argmax for hard
    # predictions and the softmaxed positive-class column for probabilities.
    y_pred = output.predictions.argmax(-1)
    y_proba = torch.softmax(torch.from_numpy(output.predictions), dim=-1)[:, 1].numpy()
    return {
        'roc_auc': roc_auc_score(y, y_proba),
        'average_precision': average_precision_score(y, y_proba),
        'accuracy': accuracy_score(y, y_pred),
        'precision': precision_score(y, y_pred),
        'recall': recall_score(y, y_pred),
        'f1': f1_score(y, y_pred),
    }


def split(random_state=42):
    """Reads the raw Ethos data, binarizes the label, and saves a stratified split."""
    print('Loading data...')
    df = pd.read_csv('data/Ethos_Dataset_Binary.csv', delimiter=';')
    # Binarize the continuous isHate score into a 0/1 label.
    df[CLASS_LABEL] = df.isHate.apply(lambda x: float(x >= 0.5))
    train_df, test_df = train_test_split(df, random_state=random_state, stratify=df[CLASS_LABEL])

    print('Saving split data...')
    train_df.to_csv(train_df_path, index=False)
    test_df.to_csv(test_df_path, index=False)


def train():
    print('Loading data...')
    train_df = pd.read_csv(train_df_path)
    test_df = pd.read_csv(test_df_path)
    # print('Engineering features...')
    # train_df = feature_engineering(train_df)
    # test_df = feature_engineering(test_df)

    with dagshub.dagshub_logger() as logger:
        print('Fitting tokenizer...')
        train_tokens, test_tokens = fit_tokenizer(train_df, test_df)
        # print('Saving TFIDF object...')
        # joblib.dump(tfidf, 'outputs/tfidf.joblib')
        # logger.log_hyperparams({'tfidf': tfidf.get_params()})

        print('Training model...')
        train_y = train_df[CLASS_LABEL]
        test_y = test_df[CLASS_LABEL]
        train_dataset = HSDataset(train_tokens, train_y.astype(int))
        test_dataset = HSDataset(test_tokens, test_y.astype(int))
        training_args = TrainingArguments(
            output_dir='./results',          # output directory
            num_train_epochs=3,              # total number of training epochs
            per_device_train_batch_size=16,  # batch size per device during training
            per_device_eval_batch_size=64,   # batch size for evaluation
            warmup_steps=500,                # number of warmup steps for the learning rate scheduler
            weight_decay=0.01,               # strength of weight decay
            logging_dir='./logs',            # directory for storing logs
            logging_steps=10,
        )
        trainer = fit_model(train_dataset, test_dataset, training_args)
        # print('Saving trained model...')
        # joblib.dump(trainer.model, 'outputs/model.joblib')
        logger.log_hyperparams(model_class=type(trainer.model).__name__)
        logger.log_hyperparams({'model': trainer.args.to_dict()})

        print('Evaluating model...')
        train_metrics = eval_model(trainer, train_dataset)
        print('Train metrics:')
        print(train_metrics)
        logger.log_metrics({f'train__{k}': v for k, v in train_metrics.items()})

        test_metrics = eval_model(trainer, test_dataset)
        print('Test metrics:')
        print(test_metrics)
        logger.log_metrics({f'test__{k}': v for k, v in test_metrics.items()})


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title='Split or Train step:', dest='step')
    subparsers.required = True
    split_parser = subparsers.add_parser('split')
    split_parser.set_defaults(func=split)
    train_parser = subparsers.add_parser('train')
    train_parser.set_defaults(func=train)
    parser.parse_args().func()
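
# -----------------------------------------------------------------------------
# Minimal inference sketch (not part of the pipeline). It assumes the
# fine-tuned weights were saved, e.g. with trainer.save_model('outputs/model');
# that save path is an assumption, since model saving is commented out above.
#
#   tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
#   model = DistilBertForSequenceClassification.from_pretrained('outputs/model')
#   tokens = tokenizer(['some comment'], truncation=True, padding=True,
#                      return_tensors='pt')
#   with torch.no_grad():
#       hate_proba = torch.softmax(model(**tokens).logits, dim=-1)[:, 1]
# -----------------------------------------------------------------------------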