Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

replace_nan.py 2.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) 2021. Jeffrey J. Nirschl. All rights reserved.
  3. #
  4. # Licensed under the MIT license. See the LICENSE.md file in the project
  5. # root directory for full license information.
  6. #
  7. # Time-stamp: <>
  8. # ======================================================================
  9. import argparse
  10. import os
  11. from pathlib import Path
  12. import yaml
  13. from src.data import load_data, load_params, save_as_csv
  14. def main(train_path, test_path,
  15. output_dir):
  16. """Split data into train, dev, and test"""
  17. output_dir = Path(output_dir).resolve()
  18. assert (os.path.isdir(output_dir)), NotADirectoryError
  19. # load data
  20. train_df, test_df = load_data([train_path, test_path], sep=",", header=0,
  21. index_col="PassengerId")
  22. # load params
  23. params = load_params()
  24. # fill nans with column mean/mode on test set
  25. # TODO - switch to allow for different interpolation methods (e.g., mean, median, MICE)
  26. if params["imputation"]["method"].lower() == "mean":
  27. mean_age = float(round(train_df["Age"].mean(), 4))
  28. mean_fare = float(round(train_df["Fare"].mean(), 4))
  29. train_df["Age"].fillna(value=mean_age,
  30. inplace=True)
  31. test_df["Age"].fillna(value=mean_age,
  32. inplace=True)
  33. test_df["Fare"].fillna(value=mean_fare,
  34. inplace=True)
  35. # update params and save imputation scheme
  36. params["imputation"]["Age"] = mean_age
  37. params["imputation"]["Fare"] = mean_fare
  38. elif params["imputation"]["method"].lower() == "mice":
  39. # TODO MICE interpolation
  40. raise NotImplementedError
  41. else:
  42. raise NotImplementedError
  43. # update params
  44. new_params = yaml.safe_dump(params)
  45. with open("params.yaml", "w") as writer:
  46. writer.write(new_params)
  47. # save data
  48. save_as_csv([train_df, test_df],
  49. [train_path, test_path],
  50. output_dir,
  51. replace_text="_categorized.csv",
  52. suffix="_nan_imputed.csv",
  53. na_rep="nan")
  54. if __name__ == "__main__":
  55. parser = argparse.ArgumentParser()
  56. parser.add_argument("-tr", "--train", dest="train_path",
  57. required=True, help="Train CSV file")
  58. parser.add_argument("-te", "--test", dest="test_path",
  59. required=True, help="Test CSV file")
  60. parser.add_argument("-o", "--out-dir", dest="output_dir",
  61. default=Path("./data/interim").resolve(),
  62. required=False, help="output directory")
  63. args = parser.parse_args()
  64. # convert categorical variables into integer codes
  65. main(args.train_path, args.test_path,
  66. args.output_dir)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...