Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

train_tfidf.py 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
  1. import joblib
  2. import numpy as np
  3. import pandas as pd
  4. from news_cat.config import get_app_settings
  5. from news_cat.ml.config import MLConfig
  6. from news_cat.ml.embedding import tfidf_train
  def vectorize():
      """Fit a Tf-Idf vectorizer on the training split and persist the results.

      Steps, in order:

      1. Read the train, test and validation frames (Feather files named by
         ``MLConfig``) from the application's data directory.
      2. Call ``tfidf_train`` on the ``clean_txt`` column of the training
         frame, capped at 25,000 features; it is unpacked as a
         ``(vectorizer, train_matrix)`` pair.
      3. Transform the test and validation text with that same vectorizer.
      4. Dump the vectorizer with ``joblib`` into the artifact directory and
         write the three matrices plus the ``category`` label arrays with
         ``np.save`` into the data directory.

      Takes no arguments and returns ``None``; progress is reported with
      ``print``.
      """
      data_dir = get_app_settings().data_dir

      def read_split(file_name):
          # Every split lives directly under the data directory.
          return pd.read_feather(data_dir.joinpath(file_name))

      def save_array(file_name, array):
          # Arrays are written next to the input frames, in the data directory.
          np.save(data_dir.joinpath(file_name), array)

      print("Loading data...")
      train_frame = read_split(MLConfig.train_df)
      test_frame = read_split(MLConfig.test_df)
      valid_frame = read_split(MLConfig.valid_df)

      max_features = 25_000
      text_column = "clean_txt"
      label_column = "category"

      print(f"Training Tf-Idf vectorizer with {max_features} max features...")
      train_texts = train_frame[text_column].values
      vectorizer, train_matrix = tfidf_train(train_texts, max_features=max_features)

      print("Transforming test and valid sets...")
      test_matrix = vectorizer.transform(test_frame[text_column].values)
      valid_matrix = vectorizer.transform(valid_frame[text_column].values)

      print("Saving vectorizer, vectorized data and labels...")
      # The fitted vectorizer goes to the artifact directory, not the data one.
      artifact_dir = get_app_settings().artifact_dir
      vectorizer_path = artifact_dir.joinpath(MLConfig.embedding.tfidf_vectorizer)
      joblib.dump(vectorizer, vectorizer_path)

      embedding_cfg = MLConfig.embedding
      save_array(embedding_cfg.train_tfidf, train_matrix)
      save_array(embedding_cfg.test_tfidf, test_matrix)
      save_array(embedding_cfg.valid_tfidf, valid_matrix)
      save_array(embedding_cfg.trainY, train_frame[label_column].values)
      save_array(embedding_cfg.testY, test_frame[label_column].values)
      save_array(embedding_cfg.validY, valid_frame[label_column].values)

      print("Done...")
  if __name__ == "__main__":
      # Run the vectorization pipeline only when executed as a script,
      # so importing this module has no side effects.
      vectorize()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...