Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess.py 1.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. import us
  5. from sklearn.metrics import fbeta_score
  6. from sklearn.model_selection import train_test_split
  7. def handle_time(X):
  8. X['utc_time'] = pd.to_datetime(X['utc_time'], unit='ms', utc=True)
  9. X['local_tz'] = X['user_state'].apply(lambda x: us.states.lookup(x).capital_tz)
  10. X['local_time'] = X.groupby('local_tz')['utc_time'].transform(lambda x: x.dt.tz_convert(x.name))
  11. X['utc_time'] = X['utc_time'].dt.tz_localize(None)
  12. X['hour'] = X['utc_time'].dt.hour
  13. X['day'] = X['utc_time'].dt.day
  14. X['month'] = X['utc_time'].dt.month
  15. return X
  16. def preprocessing():
  17. X = pd.read_csv('data\\X_train.csv')
  18. y = pd.read_json('data\\y_train.json')
  19. X_test = pd.read_csv('data\\X_test.csv')
  20. for c in ['user_isp', 'device_maker', 'device_model']:
  21. X[c] = X[c].fillna('Unknown')
  22. features = handle_time(X).drop(['bidid', 'utc_time', 'marketplace', 'local_tz', 'local_time'], axis=1)
  23. target = y
  24. X_test = handle_time(X_test).drop(['bidid', 'utc_time', 'marketplace', 'local_tz', 'local_time'], axis=1)
  25. X_train, X_val, y_train, y_val = train_test_split(features, target)
  26. np.save('processed_data\\X_train_processed',X_train )
  27. np.save('processed_data\\X_val_processed', X_val)
  28. np.save('processed_data\\y_train_processed', y_train)
  29. np.save('processed_data\\y_val_processed', y_val)
  30. np.save('processed_data\\X_test', X_test)
  31. if __name__ == "__main__":
  32. preprocessing()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...