Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurize.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from typing import Text

import hydra
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from omegaconf import DictConfig
from omegaconf.omegaconf import OmegaConf
from sklearn.preprocessing import RobustScaler
  7. @hydra.main(config_path="../configs", config_name="configs")
  8. def featurize(cfg: Text) -> None:
  9. OmegaConf.to_yaml(cfg, resolve=True)
  10. """Create new features.
  11. Args:
  12. cfg {Text}: path to config
  13. """
  14. # ingest
  15. train = pd.read_csv(cfg.train.trainset_path)
  16. test = pd.read_csv(cfg.train.testset_path)
  17. # features train
  18. x_train = train.drop(['fetal_health'], axis=1)
  19. n_sample, n_featrues = x_train.shape
  20. y_train = train['fetal_health']
  21. ros = RandomOverSampler(random_state=42)
  22. x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
  23. # features test
  24. x_test = test.drop(['fetal_health'], axis=1)
  25. y_test = test['fetal_health']
  26. # Scale train/ test predictors based on training data
  27. ro_scaler = RobustScaler().fit(x_resampled)
  28. #x_train_scaled = ro_scaler.transform(x_resampled)
  29. x_train_scaled = pd.DataFrame(ro_scaler.transform(x_resampled), columns=x_resampled.columns, index=x_resampled.index)
  30. #x_test_scaled = ro_scaler.transform(x_test)
  31. x_test_scaled = pd.DataFrame(ro_scaler.transform(x_test), columns=x_test.columns, index=x_test.index)
  32. idx = {1:0, 2:1, 3:2}
  33. # combine features and targets
  34. new_train = x_train_scaled
  35. new_train['fetal_health'] = y_resampled.map(idx)
  36. new_test = x_test_scaled
  37. new_test['fetal_health'] = y_test.map(idx)
  38. # save
  39. train_path = cfg.train.features_train_path
  40. test_path = cfg.train.features_test_path
  41. new_train.to_csv(train_path, index=False)
  42. new_test.to_csv(test_path, index=False)
  43. if __name__ == '__main__':
  44. featurize()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...