1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
|
- import os
- import pandas as pd
- from sklearn.feature_extraction.text import CountVectorizer
- from sklearn.model_selection import train_test_split
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.metrics import roc_auc_score
- import dagshub
- import string
print('[DEBUG] Starting up machine learning pipeline')

# Data paths are relative to the repo root; when the script is run from inside
# the src/ directory they need a ".." prefix.
# FIX: the original compared os.path.dirname(os.getcwd()) -- the *parent path*
# of the cwd -- to "src", which can never match. os.path.basename is the
# correct way to test the current directory's own name.
PREFIX = ".." if os.path.basename(os.getcwd()) == "src" else ""

# Column names / label values of the raw dataset
TEXT_COL_NAME = 'text'
TARGET_COL = 'label'
CLASS_0 = 'ham'
CLASS_1 = 'spam'

# File paths
RAW_DATA_PATH = os.path.join(PREFIX, 'data/enron.csv')
X_TRAIN_PATH = os.path.join(PREFIX, 'data/X_train.csv')
X_TEST_PATH = os.path.join(PREFIX, 'data/X_test.csv')
Y_TRAIN_PATH = os.path.join(PREFIX, 'data/y_train.csv')
Y_TEST_PATH = os.path.join(PREFIX, 'data/y_test.csv')

print('[DEBUG] Preprocessing raw data', '\n' + ' [DEBUG] Loading raw data')
data = pd.read_csv(RAW_DATA_PATH)

print(' [DEBUG] Removing punctuation from Emails')
# Build the punctuation-deletion table once instead of re-creating it inside
# the lambda for every row; lowercase, strip newlines, then drop punctuation
# in a single C-level translate pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
clean_text = data[TEXT_COL_NAME].map(
    lambda x: x.lower().replace('\n', '').translate(_PUNCT_TABLE))

print(' [DEBUG] Label encoding target column')
# ham -> 0, spam -> 1
y = data[TARGET_COL].map({CLASS_0: 0, CLASS_1: 1})

print(' [DEBUG] vectorizing the emails by words')
# every column is 1-2 words and the value is the number of appearance in Email
email_text_list = clean_text.tolist()
vectorizer = CountVectorizer(encoding='utf-8', decode_error='ignore',
                             stop_words='english', analyzer='word',
                             ngram_range=(1, 2), max_features=500)
X_sparse = vectorizer.fit_transform(email_text_list)
X = pd.DataFrame(X_sparse.toarray(),
                 columns=vectorizer.get_feature_names_out())

print(' [DEBUG] Splitting data to train and test')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=42)

print(' [DEBUG] Saving data to file')
X_train.to_csv(X_TRAIN_PATH, index=False)
X_test.to_csv(X_TEST_PATH, index=False)
y_train.to_csv(Y_TRAIN_PATH, index=False)
y_test.to_csv(Y_TEST_PATH, index=False)
print('[DEBUG] Initialize Modeling', '\n'+' [DEBUG] Loading data sets for modeling')
# Reload the splits from disk so this stage can run independently of the
# preprocessing stage (e.g. as a separate pipeline step).
X_train = pd.read_csv(X_TRAIN_PATH)
X_test = pd.read_csv(X_TEST_PATH)
y_train = pd.read_csv(Y_TRAIN_PATH)
y_test = pd.read_csv(Y_TEST_PATH)

print(' [DEBUG] Runing Random Forest Classifier')
with dagshub.dagshub_logger() as logger:
    rfc = RandomForestClassifier(n_estimators=1, random_state=0)
    # log the model's parameters
    logger.log_hyperparams(model_class=type(rfc).__name__)
    logger.log_hyperparams({'model': rfc.get_params()})
    # Train the model (ravel: y_train is a single-column DataFrame and
    # sklearn expects a 1-D target array)
    rfc.fit(X_train, y_train.values.ravel())
    y_pred = rfc.predict(X_test)
    # log the model's performances
    # FIX: compute the AUC once and reuse it; the original called
    # roc_auc_score twice with identical arguments and used a pointless
    # f-string (no placeholders) as the metric key.
    auc = round(roc_auc_score(y_test, y_pred), 3)
    logger.log_metrics({'roc_auc_score': auc})
    print(' [INFO] Finished modeling with AUC Score:', auc)
|