1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
- import os
- import pandas as pd
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.model_selection import train_test_split
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.metrics import roc_auc_score
- import dagshub
- import string
print('[DEBUG] Starting up machine learning pipeline')

# Data paths are relative to the repo root; when the script is run from inside
# the src/ directory they need a ".." prefix.
# FIX: the original compared os.path.dirname(os.getcwd()) -- the *parent path*
# of the cwd -- to "src", which can never match. os.path.basename is the
# correct way to test the current directory's own name.
PREFIX = ".." if os.path.basename(os.getcwd()) == "src" else ""

# Column names / label values of the raw dataset
TEXT_COL_NAME = 'text'
TARGET_COL = 'label'
CLASS_0 = 'ham'
CLASS_1 = 'spam'

# File paths
RAW_DATA_PATH = os.path.join(PREFIX, 'data/enron.csv')
X_TRAIN_PATH = os.path.join(PREFIX, 'data/X_train.csv')
X_TEST_PATH = os.path.join(PREFIX, 'data/X_test.csv')
Y_TRAIN_PATH = os.path.join(PREFIX, 'data/y_train.csv')
Y_TEST_PATH = os.path.join(PREFIX, 'data/y_test.csv')

print('[DEBUG] Preprocessing raw data', '\n' + ' [DEBUG] Loading raw data')
data = pd.read_csv(RAW_DATA_PATH)

print(' [DEBUG] Removing punctuation from Emails')
# Build the punctuation-deletion table once instead of re-creating it inside
# the lambda for every row; lowercase, strip newlines, then drop punctuation
# in a single C-level translate pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
clean_text = data[TEXT_COL_NAME].map(
    lambda x: x.lower().replace('\n', '').translate(_PUNCT_TABLE))

print(' [DEBUG] Label encoding target column')
# ham -> 0, spam -> 1
y = data[TARGET_COL].map({CLASS_0: 0, CLASS_1: 1})

print(' [DEBUG] vectorizing the emails by words')
# every column is 1-2 words and the value is the number of appearance in Email
email_text_list = clean_text.tolist()
vectorizer = CountVectorizer(encoding='utf-8', decode_error='ignore',
                             stop_words='english', analyzer='word',
                             ngram_range=(1, 2), max_features=500)
X_sparse = vectorizer.fit_transform(email_text_list)
X = pd.DataFrame(X_sparse.toarray(),
                 columns=vectorizer.get_feature_names_out())

print(' [DEBUG] Splitting data to train and test')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=42)

print(' [DEBUG] Saving data to file')
X_train.to_csv(X_TRAIN_PATH, index=False)
X_test.to_csv(X_TEST_PATH, index=False)
y_train.to_csv(Y_TRAIN_PATH, index=False)
y_test.to_csv(Y_TEST_PATH, index=False)
print('[DEBUG] Initialize Modeling', '\n'+' [DEBUG] Loading data sets for modeling')
# Reload the splits from disk so this stage can run independently of the
# preprocessing stage (e.g. as a separate pipeline step).
X_train = pd.read_csv(X_TRAIN_PATH)
X_test = pd.read_csv(X_TEST_PATH)
y_train = pd.read_csv(Y_TRAIN_PATH)
y_test = pd.read_csv(Y_TEST_PATH)

print(' [DEBUG] Runing Random Forest Classifier')
with dagshub.dagshub_logger() as logger:
    rfc = RandomForestClassifier(n_estimators=1, random_state=0)
    # log the model's parameters
    logger.log_hyperparams(model_class=type(rfc).__name__)
    logger.log_hyperparams({'model': rfc.get_params()})
    # Train the model (ravel: y_train is a single-column DataFrame and
    # sklearn expects a 1-D target array)
    rfc.fit(X_train, y_train.values.ravel())
    y_pred = rfc.predict(X_test)
    # log the model's performances
    # FIX: compute the AUC once and reuse it; the original called
    # roc_auc_score twice with identical arguments and used a pointless
    # f-string (no placeholders) as the metric key.
    auc = round(roc_auc_score(y_test, y_pred), 3)
    logger.log_metrics({'roc_auc_score': auc})
    print(' [INFO] Finished modeling with AUC Score:', auc)
|