add_files

danigrim 7 months ago
commit
635e57a765
11 changed files with 1058647 additions and 0 deletions
  1. 11
    0
      README.md
  2. 32
    0
      code/data-preprocessing.py
  3. 16
    0
      code/modeling.py
  4. 7780
    0
      data/X_test.csv
  5. 31113
    0
      data/X_train.csv
  6. 980766
    0
      data/enron.csv
  7. 7780
    0
      data/y_test.csv
  8. 31113
    0
      data/y_train.csv
  9. 10
    0
      requirements.txt
  10. BIN
      src/.DS_Store
  11. 26
    0
      src/const.py
@@ -0,0 +1,11 @@
+# First Repo Project
+
+This project is a simple 'Ham or Spam' classifier for emails using the Enron data set. It contains two Python code files, five data files, and one constants file.
+
+- code directory - holds the data-preprocessing and modeling files:
+    - data-preprocessing.py - processes the raw data (enron.csv), splits it into train and test sets, and saves them to the data directory.
+    - modeling.py - trains a simple Random Forest classifier on the train set and prints its ROC AUC score on the test set.
+- data directory - contains the raw and processed data.
+- src - contains the constants file.
+- requirements.txt - python dependencies that are required to run the python files.
+- README.md - Read me file.
@@ -0,0 +1,32 @@
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.model_selection import train_test_split
+from src.const import *
+import string
+
+print(M_PRO_INIT, '\n' + M_PRO_LOAD_DATA)
+data = pd.read_csv(RAW_DATA_PATH)
+
+print(M_PRO_RMV_PUNC)
+clean_text = data[TEXT_COL_NAME].map(lambda x: x.lower().replace('\n', ''). \
+                                     translate(str.maketrans('', '', string.punctuation)))
+
+print(M_PRO_LE)
+y = data[TARGET_COL].map({CLASS_0: 0, CLASS_1: 1})
+
+print(M_PRO_VEC)
+# every column is 1-2 words and the value is the number of appearance in Email
+email_text_list = clean_text.tolist()
+vectorizer = CountVectorizer(encoding='utf-8', decode_error='ignore', stop_words='english',
+                             analyzer='word', ngram_range=(1, 2), max_features=500)
+X_sparse = vectorizer.fit_transform(email_text_list)
+X = pd.DataFrame(X_sparse.toarray(), columns=vectorizer.get_feature_names())
+
+print(M_PRO_SPLIT_DATA)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
+
+print(M_PRO_SAVE_DATA)
+X_train.to_csv(X_TRAIN_PATH, index=False)
+X_test.to_csv(X_TEST_PATH, index=False)
+y_train.to_csv(Y_TRAIN_PATH, index=False)
+y_test.to_csv(Y_TEST_PATH, index=False)
@@ -0,0 +1,16 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import roc_auc_score
+import pandas as pd
+from src.const import *
+
+print(M_MOD_INIT,'\n'+M_MOD_LOAD_DATA)
+X_train = pd.read_csv(X_TRAIN_PATH)
+X_test = pd.read_csv(X_TEST_PATH)
+y_train = pd.read_csv(Y_TRAIN_PATH)
+y_test = pd.read_csv(Y_TEST_PATH)
+
+print(M_MOD_RFC)
+rfc = RandomForestClassifier(n_estimators=1, random_state=0)
+rfc.fit(X_train, y_train.values.ravel())
+y_pred = rfc.predict(X_test)
+print(M_MOD_SCORE, round(roc_auc_score(y_test, y_pred),3))
@@ -0,0 +1,10 @@
+joblib==1.0.1
+numpy==1.20.1
+pandas==1.2.2
+python-dateutil==2.8.1
+pytz==2021.1
+scikit-learn==0.24.1
+scipy==1.6.0
+six==1.15.0
+sklearn==0.0
+threadpoolctl==2.1.0
@@ -0,0 +1,26 @@
+# Values: column names and class labels of the raw data set
+TEXT_COL_NAME = 'text'  # column whose email text is cleaned and vectorized
+TARGET_COL = 'label'  # column holding the class label of each email
+CLASS_0 = 'ham'  # encoded as 0 by data-preprocessing.py
+CLASS_1 = 'spam'  # encoded as 1 by data-preprocessing.py

+# Paths: relative strings, so they resolve against the directory the scripts are run from
+RAW_DATA_PATH = '../data/enron.csv'  # read by data-preprocessing.py
+X_TRAIN_PATH = '../data/X_train.csv'  # written by data-preprocessing.py, read by modeling.py
+X_TEST_PATH = '../data/X_test.csv'  # written by data-preprocessing.py, read by modeling.py
+Y_TRAIN_PATH = '../data/y_train.csv'  # written by data-preprocessing.py, read by modeling.py
+Y_TEST_PATH = '../data/y_test.csv'  # written by data-preprocessing.py, read by modeling.py

+# Messages: progress lines printed by data-preprocessing.py (M_PRO_*) and modeling.py (M_MOD_*)
+M_PRO_INIT = '[DEBUG] Preprocessing raw data'
+M_PRO_LOAD_DATA = '     [DEBUG] Loading raw data'
+M_PRO_RMV_PUNC = '     [DEBUG] Removing punctuation from Emails'
+M_PRO_LE = '     [DEBUG] Label encoding target column'
+M_PRO_VEC = '     [DEBUG] vectorizing the emails by words'
+M_PRO_SPLIT_DATA = '     [DEBUG] Splitting data to train and test'
+M_PRO_SAVE_DATA = '     [DEBUG] Saving data to file'

+M_MOD_INIT = '[DEBUG] Initialize Modeling'
+M_MOD_LOAD_DATA = '     [DEBUG] Loading data sets for modeling'
+M_MOD_RFC ='     [DEBUG] Runing Random Forest Classifier'  # NOTE(review): "Runing" is a typo for "Running" in the printed text
+M_MOD_SCORE = '     [INFO] Finished modeling with AUC Score:'  # printed followed by the rounded score