data-preprocessing.py 1.1 KB

12345678910111213141516171819202122232425262728293031
  # Preprocessing script: loads the raw CSV, normalises the text column,
  # encodes the target as 0/1, builds bag-of-words count features and writes
  # an 80/20 train/test split to CSV.
  import pandas as pd
  from sklearn.feature_extraction.text import CountVectorizer
  from sklearn.model_selection import train_test_split

  # Paths, column names, class labels and progress messages come from const.
  from const import *

  print(M_PRO_INIT, '\n' + M_PRO_LOAD_DATA)
  data = pd.read_csv(RAW_DATA_PATH)

  print(M_PRO_RMV_PUNC)
  # Lower-case each text and turn line breaks into spaces. Replacing '\n' with
  # an empty string would glue the last word of one line to the first word of
  # the next and corrupt the vocabulary built below.
  clean_text = data[TEXT_COL_NAME].map(lambda x: x.lower().replace('\n', ' '))

  print(M_PRO_LE)
  # Map the two class labels to 0 and 1; any other label becomes NaN.
  y = data[TARGET_COL].map({CLASS_0: 0, CLASS_1: 1})

  print(M_PRO_VEC)
  # Each feature column is a 1- or 2-word term; each value is the number of
  # times that term appears in the corresponding text. Only the 500 most
  # frequent terms are kept, with English stop words removed.
  email_text_list = clean_text.tolist()
  vectorizer = CountVectorizer(
      encoding='utf-8',
      decode_error='ignore',
      stop_words='english',
      analyzer='word',
      ngram_range=(1, 2),
      max_features=500,
  )
  X_sparse = vectorizer.fit_transform(email_text_list)

  # get_feature_names() was removed in scikit-learn 1.2; prefer its
  # replacement and fall back only on releases that predate it.
  if hasattr(vectorizer, 'get_feature_names_out'):
      feature_names = vectorizer.get_feature_names_out()
  else:
      feature_names = vectorizer.get_feature_names()
  X = pd.DataFrame(X_sparse.toarray(), columns=feature_names)

  print(M_PRO_SPLIT_DATA)
  # Fixed random_state keeps the 80/20 split reproducible between runs.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

  print(M_PRO_SAVE_DATA)
  X_train.to_csv(X_TRAIN_PATH, index=False)
  X_test.to_csv(X_TEST_PATH, index=False)
  y_train.to_csv(Y_TRAIN_PATH, index=False)
  y_test.to_csv(Y_TEST_PATH, index=False)
Tip!

Press p or ← to see the previous file, or n or → to see the next file