Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

process_data.py 2.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  1. import hydra
  2. import pandas as pd
  3. from omegaconf import DictConfig
  4. from sklearn.preprocessing import StandardScaler
  5. def load_data(data_name: str) -> pd.DataFrame:
  6. return pd.read_csv(data_name)
  7. def drop_na(df: pd.DataFrame) -> pd.DataFrame:
  8. return df.dropna()
  9. def get_age(df: pd.DataFrame) -> pd.DataFrame:
  10. return df.assign(age=df["Year_Birth"].apply(lambda row: 2021 - row))
  11. def get_total_children(df: pd.DataFrame) -> pd.DataFrame:
  12. return df.assign(total_children=df["Kidhome"] + df["Teenhome"])
  13. def get_total_purchases(df: pd.DataFrame) -> pd.DataFrame:
  14. purchases_columns = df.filter(like="Purchases", axis=1).columns
  15. return df.assign(total_purchases=df[purchases_columns].sum(axis=1))
  16. def get_enrollment_years(df: pd.DataFrame) -> pd.DataFrame:
  17. df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format="%d-%m-%Y")
  18. return df.assign(enrollment_years=2022 - df["Dt_Customer"].dt.year)
  19. def get_family_size(df: pd.DataFrame, size_map: dict) -> pd.DataFrame:
  20. return df.assign(family_size=df["Marital_Status"].map(size_map) + df["total_children"])
  21. def drop_features(df: pd.DataFrame, keep_columns: list):
  22. df = df[keep_columns]
  23. return df
  24. def drop_outliers(df: pd.DataFrame, column_threshold: dict):
  25. for col, threshold in column_threshold.items():
  26. df = df[df[col] < threshold]
  27. return df.reset_index(drop=True)
  28. def drop_columns_and_rows(df: pd.DataFrame, keep_columns: list, remove_outliers_threshold: dict):
  29. return df.pipe(drop_features, keep_columns=keep_columns).pipe(
  30. drop_outliers, column_threshold=remove_outliers_threshold
  31. )
  32. def scale_features(df: pd.DataFrame):
  33. scaler = StandardScaler()
  34. return pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
  35. @hydra.main(config_path="../config", config_name="main")
  36. def process_data(config: DictConfig):
  37. process_config = config.process
  38. df = load_data(config.raw_data.path)
  39. df = drop_na(df)
  40. df = get_age(df)
  41. df = get_total_children(df)
  42. df = get_total_purchases(df)
  43. df = get_enrollment_years(df)
  44. df = get_family_size(df, process_config.family_size)
  45. df = drop_columns_and_rows(
  46. df,
  47. process_config.keep_columns,
  48. process_config.remove_outliers_threshold,
  49. )
  50. df = scale_features(df)
  51. df.to_csv(config.intermediate.path, index=False)
  52. if __name__ == "__main__":
  53. process_data()
Tip!

Press p or the left arrow key to see the previous file, or n or the right arrow key to see the next file

Comments

Loading...