Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

feature_engineering.py 1.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
  1. import os
  2. import pandas as pd
  3. import plac
  4. from sklearn.decomposition import PCA
  5. from utils import log_experiment, save_results, read_data, read_params
  @plac.annotations(
      data_path=("Path to source data", "option", "i", str),
      feature_path=("Path to save featurized data", "option", "f", str),
      out_path=("Path to save pca model", "option", "o", str),
  )
  def main(data_path='data/split/', feature_path='data/features/', out_path='data/pca/'):
      """Fit a PCA on the training split and persist features, model and metrics.

      The train/test split is loaded with ``read_data``, a ``PCA`` is built from
      the parameters returned by ``read_params('params.yaml', 'pca')`` and fitted
      on the training features only. Both splits are then projected, labelled
      with a ``class`` column and written to ``train.csv`` / ``test.csv`` inside
      ``feature_path``. The fitted model is handed to ``save_results`` and the
      explained-variance statistics are printed and passed to ``log_experiment``.

      Args:
          data_path: Path to source data, forwarded to ``read_data``.
          feature_path: Directory in which the featurized CSV files are saved.
              It is created (including missing parents) when absent; a trailing
              path separator is optional.
          out_path: Path to save the PCA model, forwarded to ``save_results``
              and ``log_experiment``.
      """
      X_train, X_test, y_train, y_test = read_data(data_path)
      params = read_params('params.yaml', 'pca')

      # Fit on the training split only, then apply the same projection to both.
      pca = PCA(**params).fit(X_train)
      train_feature = pd.DataFrame(pca.transform(X_train))
      test_feature = pd.DataFrame(pca.transform(X_test))

      # NOTE(review): if read_data returns the labels as pandas objects with a
      # non-default index, this assignment aligns on index against the fresh
      # RangeIndex of the frames above - confirm the labels are positional.
      train_feature['class'] = y_train
      test_feature['class'] = y_test

      # makedirs(exist_ok=True) avoids the check-then-create race and also
      # creates missing parent directories.
      os.makedirs(feature_path, exist_ok=True)
      # os.path.join keeps the files inside feature_path whether or not the
      # caller supplied a trailing separator.
      train_feature.to_csv(os.path.join(feature_path, 'train.csv'), index=False)
      test_feature.to_csv(os.path.join(feature_path, 'test.csv'), index=False)

      save_results(out_path, pca, None)

      print('Finished Feature Engineering:\nStats:')
      print(f'\tExplained Variance: {pca.explained_variance_}')
      print(f'\tExplained Variance Ratio: {pca.explained_variance_ratio_}')

      log_experiment(
          out_path,
          params=params,
          metrics=dict(
              explained_variance_=pca.explained_variance_,
              explained_variance_ratio_=pca.explained_variance_ratio_,
          ),
      )
  # Script entry point: hand main to plac, which parses the command-line
  # options declared in its annotations and invokes it.
  if __name__ == "__main__":
      plac.call(main)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...