code.py

import os
import string

import pandas as pd
import dagshub
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

print('[DEBUG] Starting up machine learning pipeline')

# Use ".." as a path prefix when the script is run from inside the src directory
PREFIX = ".." if os.path.basename(os.getcwd()) == "src" else ""

# Values
TEXT_COL_NAME = 'text'
TARGET_COL = 'label'
CLASS_0 = 'ham'
CLASS_1 = 'spam'

# Paths
RAW_DATA_PATH = os.path.join(PREFIX, 'data/enron.csv')
X_TRAIN_PATH = os.path.join(PREFIX, 'data/X_train.csv')
X_TEST_PATH = os.path.join(PREFIX, 'data/X_test.csv')
Y_TRAIN_PATH = os.path.join(PREFIX, 'data/y_train.csv')
Y_TEST_PATH = os.path.join(PREFIX, 'data/y_test.csv')

print('[DEBUG] Preprocessing raw data', '\n' + ' [DEBUG] Loading raw data')
data = pd.read_csv(RAW_DATA_PATH)

print(' [DEBUG] Removing punctuation from emails')
clean_text = data[TEXT_COL_NAME].map(
    lambda x: x.lower().replace('\n', '').translate(str.maketrans('', '', string.punctuation)))

print(' [DEBUG] Label encoding target column')
y = data[TARGET_COL].map({CLASS_0: 0, CLASS_1: 1})

print(' [DEBUG] Vectorizing the emails by words')
# Every column is a 1-2 word n-gram and the value is its number of appearances in the email
email_text_list = clean_text.tolist()
vectorizer = CountVectorizer(encoding='utf-8', decode_error='ignore', stop_words='english',
                             analyzer='word', ngram_range=(1, 2), max_features=500)
X_sparse = vectorizer.fit_transform(email_text_list)
X = pd.DataFrame(X_sparse.toarray(), columns=vectorizer.get_feature_names_out())

print(' [DEBUG] Splitting data to train and test')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print(' [DEBUG] Saving data to file')
X_train.to_csv(X_TRAIN_PATH, index=False)
X_test.to_csv(X_TEST_PATH, index=False)
y_train.to_csv(Y_TRAIN_PATH, index=False)
y_test.to_csv(Y_TEST_PATH, index=False)

print('[DEBUG] Initialize Modeling', '\n' + ' [DEBUG] Loading data sets for modeling')
X_train = pd.read_csv(X_TRAIN_PATH)
X_test = pd.read_csv(X_TEST_PATH)
y_train = pd.read_csv(Y_TRAIN_PATH)
y_test = pd.read_csv(Y_TEST_PATH)

print(' [DEBUG] Running Random Forest Classifier')
with dagshub.dagshub_logger() as logger:
    rfc = RandomForestClassifier(n_estimators=1, random_state=0)
    # Log the model's parameters
    logger.log_hyperparams(model_class=type(rfc).__name__)
    logger.log_hyperparams({'model': rfc.get_params()})
    # Train the model
    rfc.fit(X_train, y_train.values.ravel())
    y_pred = rfc.predict(X_test)
    # Log the model's performance
    auc = round(roc_auc_score(y_test, y_pred), 3)
    logger.log_metrics({'roc_auc_score': auc})
    print(' [INFO] Finished modeling with AUC Score:', auc)
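For context, here is a minimal sketch (an assumption, not part of the repository) of the two-column layout code.py expects to find in data/enron.csv: a raw email body under 'text' and a 'ham'/'spam' string under 'label'.

import pandas as pd

# Hypothetical example rows; the real Enron spam dataset is much larger and may carry extra columns.
example = pd.DataFrame({
    'text': [
        'Subject: meeting moved to 3pm, see you there',
        'Subject: you won a FREE prize!!! click now',
    ],
    'label': ['ham', 'spam'],
})
print(example)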