1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
|
- import dagshub
- import mlflow
- import argparse
- import pandas as pd
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.linear_model import SGDClassifier
- from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, \
- f1_score
- from sklearn.model_selection import train_test_split
- import joblib
# DagsHub repository coordinates. "<username>" is a placeholder — replace it
# with a real DagsHub username before running.
DAGSHUB_REPO_OWNER = "<username>"
DAGSHUB_REPO = "DAGsHub-Tutorial"
# Import-time side effect: points the MLflow tracking client at the DagsHub repo.
dagshub.init(DAGSHUB_REPO, DAGSHUB_REPO_OWNER)
# Consts
CLASS_LABEL = 'MachineLearning'  # binary target: question is tagged machine-learning
train_df_path = 'data/train.csv.zip'  # written by `split`, read by `train`
test_df_path = 'data/test.csv.zip'    # written by `split`, read by `train`
def get_or_create_experiment_id(name):
    """Return the id of the MLflow experiment *name*, creating it if absent."""
    existing = mlflow.get_experiment_by_name(name)
    if existing is not None:
        return existing.experiment_id
    return mlflow.create_experiment(name)
def feature_engineering(raw_df):
    """Derive model features from the raw questions frame.

    Adds an epoch timestamp, title/body character lengths, and a combined
    ``Text`` column, and drops ``Id``, ``Tags`` and the correlated
    ``FavoriteCount``. The input frame is left unmodified.
    """
    df = raw_df.copy()
    creation = pd.to_datetime(df['CreationDate'])
    df['CreationDate'] = creation
    # Seconds since the Unix epoch (datetime64 holds nanoseconds).
    df['CreationDate_Epoch'] = creation.astype('int64') // 10 ** 9
    df['Title_Len'] = df['Title'].str.len()
    df['Body_Len'] = df['Body'].str.len()
    # Single free-text field for the TF-IDF vectorizer; missing parts become ''.
    df['Text'] = df['Title'].fillna('') + ' ' + df['Body'].fillna('')
    # Identifiers, label leakage (Tags), and the correlated FavoriteCount.
    return df.drop(columns=['Id', 'Tags', 'FavoriteCount'])
def fit_tfidf(train_df, test_df):
    """Fit a 25k-feature TF-IDF vectorizer on the train text; vectorize both splits.

    Returns ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    vectorizer = TfidfVectorizer(max_features=25000)
    # fit_transform is documented equivalent to fit() followed by transform().
    train_matrix = vectorizer.fit_transform(train_df['Text'])
    test_matrix = vectorizer.transform(test_df['Text'])
    return train_matrix, test_matrix, vectorizer
def fit_model(train_X, train_y, random_state=42):
    """Train an SGD classifier on TF-IDF features.

    modified_huber loss is chosen so the model supports predict_proba,
    which the evaluation metrics below rely on.
    """
    model = SGDClassifier(loss='modified_huber', random_state=random_state)
    model.fit(train_X, train_y)
    return model
def eval_model(clf, X, y):
    """Score a fitted binary classifier on (X, y).

    Returns a dict with roc_auc, average_precision, accuracy, precision,
    recall and f1 (in that key order).
    """
    proba = clf.predict_proba(X)[:, 1]
    preds = clf.predict(X)
    metrics = {}
    # Threshold-free, probability-based metrics first.
    metrics['roc_auc'] = roc_auc_score(y, proba)
    metrics['average_precision'] = average_precision_score(y, proba)
    # Hard-label metrics.
    metrics['accuracy'] = accuracy_score(y, preds)
    metrics['precision'] = precision_score(y, preds)
    metrics['recall'] = recall_score(y, preds)
    metrics['f1'] = f1_score(y, preds)
    return metrics
def split(random_state=42):
    """Split the raw questions CSV into stratified train/test files on disk.

    Reads data/CrossValidated-Questions.csv, derives the boolean target
    column, and writes the two splits to ``train_df_path``/``test_df_path``.
    """
    print('Loading data...')
    df = pd.read_csv('data/CrossValidated-Questions.csv')
    # Positive iff the tags mention machine-learning; untagged rows are negatives.
    df[CLASS_LABEL] = df['Tags'].str.contains('machine-learning').fillna(False)
    # Stratify so both splits keep the (imbalanced) class ratio.
    train_df, test_df = train_test_split(df, random_state=random_state, stratify=df[CLASS_LABEL])
    print('Saving split data...')
    # index=False: without it the positional index is persisted and comes back
    # as a spurious 'Unnamed: 0' column when `train` re-reads the CSVs.
    train_df.to_csv(train_df_path, index=False)
    test_df.to_csv(test_df_path, index=False)
def train():
    """Train the TF-IDF + SGD pipeline and log everything to MLflow.

    Reads the split CSVs produced by `split`, engineers features, fits the
    vectorizer and classifier, persists both under outputs/, and logs
    parameters plus train/test metrics under the "tutorial" experiment.
    """
    import os  # local import: only needed to ensure the artifact directory exists

    print('Loading data...')
    train_df = pd.read_csv(train_df_path)
    test_df = pd.read_csv(test_df_path)

    print('Engineering features...')
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Fix: joblib.dump raises FileNotFoundError when outputs/ does not exist yet.
    os.makedirs('outputs', exist_ok=True)

    exp_id = get_or_create_experiment_id("tutorial")
    with mlflow.start_run(experiment_id=exp_id):
        print('Fitting TFIDF...')
        train_tfidf, test_tfidf, tfidf = fit_tfidf(train_df, test_df)
        print('Saving TFIDF object...')
        joblib.dump(tfidf, 'outputs/tfidf.joblib')
        mlflow.log_params({f'tfidf__{k}': v for k, v in tfidf.get_params().items()})

        print('Training model...')
        train_y = train_df[CLASS_LABEL]
        model = fit_model(train_tfidf, train_y)
        print('Saving trained model...')
        joblib.dump(model, 'outputs/model.joblib')
        mlflow.log_param("model_class", type(model).__name__)
        mlflow.log_params({f'model__{k}': v for k, v in model.get_params().items()})

        print('Evaluating model...')
        # Train-set metrics are logged too so over/under-fitting is visible.
        train_metrics = eval_model(model, train_tfidf, train_y)
        print('Train metrics:')
        print(train_metrics)
        mlflow.log_metrics({f'train__{k}': v for k, v in train_metrics.items()})

        test_metrics = eval_model(model, test_tfidf, test_df[CLASS_LABEL])
        print('Test metrics:')
        print(test_metrics)
        mlflow.log_metrics({f'test__{k}': v for k, v in test_metrics.items()})
if __name__ == '__main__':
    # CLI: `python <script> split` or `python <script> train`; each subcommand
    # stores its handler via set_defaults and is invoked after parsing.
    parser = argparse.ArgumentParser()
    steps = parser.add_subparsers(title='Split or Train step:', dest='step')
    steps.required = True
    for step_name, step_fn in (('split', split), ('train', train)):
        steps.add_parser(step_name).set_defaults(func=step_fn)
    parser.parse_args().func()
|