Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

process_data.py 2.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
  1. import pandas as pd
  2. from dagshub.logger import DAGsHubLogger
  3. from omegaconf import DictConfig
  4. from sklearn.preprocessing import StandardScaler
  5. import hydra
  def load_data(data_name: str) -> pd.DataFrame:
      """Read a CSV file into a DataFrame.

      Args:
          data_name: Location of the CSV, passed straight to ``pandas.read_csv``.

      Returns:
          The parsed table, using ``read_csv`` defaults.
      """
      return pd.read_csv(data_name)
  def drop_na(df: pd.DataFrame) -> pd.DataFrame:
      """Return a copy of ``df`` without the rows that contain a missing value.

      The original row labels are kept; the index is not renumbered here.
      """
      # Spelled-out defaults: drop rows (axis 0) with at least one NA ("any").
      return df.dropna(axis=0, how="any")
  def get_age(df: pd.DataFrame, current_year: int = 2021) -> pd.DataFrame:
      """Add an ``age`` column computed from ``Year_Birth``.

      Args:
          df: Frame containing a numeric ``Year_Birth`` column.
          current_year: Year the age is measured against. Defaults to 2021,
              the value that used to be hard-coded, so existing callers get
              the same result.

      Returns:
          A new frame with ``age = current_year - Year_Birth``; ``df`` itself
          is left untouched.
      """
      # Vectorized subtraction replaces the per-element ``apply(lambda ...)``.
      return df.assign(age=current_year - df["Year_Birth"])
  def get_total_children(df: pd.DataFrame) -> pd.DataFrame:
      """Add ``total_children`` as the sum of ``Kidhome`` and ``Teenhome``.

      Returns a new frame; the input is not modified.
      """
      kids_at_home = df["Kidhome"]
      teens_at_home = df["Teenhome"]
      return df.assign(total_children=kids_at_home.add(teens_at_home))
  def get_total_purchases(df: pd.DataFrame) -> pd.DataFrame:
      """Add ``total_purchases``: the row-wise sum of every purchases column.

      A column counts as a purchases column when its name contains the
      substring ``"Purchases"``.

      Returns a new frame; the input is not modified.
      """
      matching_names = df.filter(like="Purchases", axis="columns").columns
      row_totals = df[matching_names].sum(axis="columns")
      return df.assign(total_purchases=row_totals)
  def get_enrollment_years(
      df: pd.DataFrame, reference_year: int = 2022
  ) -> pd.DataFrame:
      """Add ``enrollment_years`` derived from the ``Dt_Customer`` date.

      Args:
          df: Frame with a ``Dt_Customer`` column parseable by
              ``pandas.to_datetime``.
          reference_year: Year the enrollment length is measured against.
              Defaults to 2022, the value that used to be hard-coded.

      Returns:
          A new frame in which ``Dt_Customer`` is converted to datetime and
          ``enrollment_years = reference_year - Dt_Customer.year``.
      """
      # Parse into a local Series instead of assigning back onto ``df``: the
      # previous in-place write silently changed the caller's frame, unlike
      # the sibling feature helpers, which all return a new frame.
      enrolled_on = pd.to_datetime(df["Dt_Customer"])
      return df.assign(
          Dt_Customer=enrolled_on,
          enrollment_years=reference_year - enrolled_on.dt.year,
      )
  def get_family_size(df: pd.DataFrame, size_map: dict) -> pd.DataFrame:
      """Add ``family_size`` from marital status and number of children.

      Args:
          df: Frame with ``Marital_Status`` and ``total_children`` columns.
          size_map: Lookup from a ``Marital_Status`` value to a number, applied
              with ``Series.map``.

      Returns:
          A new frame where ``family_size`` is the mapped status value plus
          ``total_children``.
      """
      status_size = df["Marital_Status"].map(size_map)
      household_size = status_size + df["total_children"]
      return df.assign(family_size=household_size)
  def drop_features(df: pd.DataFrame, keep_columns: list):
      """Keep only the columns named in ``keep_columns``, in that order."""
      return df[keep_columns]
  def drop_outliers(df: pd.DataFrame, column_threshold: dict):
      """Drop rows whose value reaches or exceeds a per-column threshold.

      Args:
          df: Frame to filter.
          column_threshold: Maps a column name to its upper bound. The bound
              is exclusive: only rows with ``value < threshold`` are kept.

      Returns:
          The surviving rows with a fresh 0..n-1 index.
      """
      remaining = df
      for name, upper_bound in column_threshold.items():
          is_below = remaining[name].lt(upper_bound)
          remaining = remaining[is_below]
      return remaining.reset_index(drop=True)
  def drop_columns_and_rows(
      df: pd.DataFrame, keep_columns: list, remove_outliers_threshold: dict
  ) -> pd.DataFrame:
      """Select the wanted columns, then remove outlier rows.

      Args:
          df: Frame to reduce.
          keep_columns: Column names to retain. This is forwarded to
              ``drop_features``, which takes a list, so the annotation is
              ``list`` rather than the previous ``dict``.
          remove_outliers_threshold: Per-column thresholds forwarded to
              ``drop_outliers``.

      Returns:
          The result of ``drop_outliers`` applied to the column-reduced frame.
      """
      # Columns are selected first, so every threshold column must also be
      # listed in ``keep_columns``.
      selected = drop_features(df, keep_columns=keep_columns)
      return drop_outliers(selected, column_threshold=remove_outliers_threshold)
  def get_scaler(df: pd.DataFrame):
      """Create a ``StandardScaler`` and fit it on ``df``.

      Returns:
          The fitted scaler instance.
      """
      # ``fit`` returns the estimator itself, so create and fit in one step.
      return StandardScaler().fit(df)
  def scale_features(df: pd.DataFrame, scaler: StandardScaler):
      """Apply ``scaler.transform`` to ``df`` and wrap the result in a DataFrame.

      The returned frame reuses ``df``'s column labels. No index is passed, so
      it gets a default index rather than ``df``'s.
      """
      scaled_values = scaler.transform(df)
      return pd.DataFrame(scaled_values, columns=df.columns)
  @hydra.main(
      config_path="../config",
      config_name="main",
  )
  def process_data(config: DictConfig):
      """Run the preprocessing pipeline and write the scaled table to disk.

      Reads the CSV at ``config.raw_data.path``, drops rows with missing
      values, derives the age / children / purchases / enrollment /
      family-size features, keeps ``config.process.keep_columns``, removes
      rows at or above ``config.process.remove_outliers_threshold``,
      standard-scales everything and saves the result, without the index, to
      ``config.intermediate.path``.
      """
      # Each step takes a frame and returns a frame, so the stages are chained
      # with ``pipe`` in the order they must run.
      features = (
          load_data(config.raw_data.path)
          .pipe(drop_na)
          .pipe(get_age)
          .pipe(get_total_children)
          .pipe(get_total_purchases)
          .pipe(get_enrollment_years)
          .pipe(get_family_size, config.process.family_size)
          .pipe(
              drop_columns_and_rows,
              config.process.keep_columns,
              config.process.remove_outliers_threshold,
          )
      )

      fitted_scaler = get_scaler(features)
      scaled = scale_features(features, fitted_scaler)
      scaled.to_csv(config.intermediate.path, index=False)


  # Run the pipeline only when this file is executed as a script.
  if __name__ == "__main__":
      process_data()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...