Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurize.py 2.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
  1. import os
  2. import sys
  3. import yaml
  4. import numpy as np
  5. import pandas as pd
  6. import sys
  7. import pickle
  8. from sklearn.impute import KNNImputer
  9. params = yaml.safe_load(open('params.yaml'))['ft']
  10. np.set_printoptions(suppress=True)
  11. if len(sys.argv) != 3:
  12. sys.stderr.write('Argument error. Usage:\n')
  13. sys.stderr.write('\tpython featurization.py data-dit-path feature-dir-path\n')
  14. sys.exit(1)
  15. train_input = os.path.join(sys.argv[1], 'train.csv')
  16. test_input = os.path.join(sys.argv[1], 'test.csv')
  17. train_output = os.path.join(sys.argv[2], 'ks_train.csv')
  18. test_output = os.path.join(sys.argv[2], 'ks_test.csv')
  19. n_neigh = params['n_neigh']
  20. train_df = pd.read_csv(train_input)
  21. test_df = pd.read_csv(test_input)
  22. col_mask = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
  23. ms_train_df = train_df.loc[:, col_mask]
  24. ms_train_df['target'] = train_df['Survived']
  25. ms_test_df = test_df.loc[:, col_mask]
  26. # Label Encoding
  27. def cat_to_int(df, columns, enc={}):
  28. df = df.copy()
  29. if enc == {}:
  30. maps = {}
  31. for col in columns:
  32. mapping = {k: i for i,k in enumerate(df.loc[:,col].unique())}
  33. df[col] = df[col].map(mapping)
  34. maps[col] = mapping
  35. return df, maps
  36. else:
  37. maps = enc
  38. for col in columns:
  39. df[col] = df[col].map(maps[col])
  40. return df
  41. enc_train_df, enc_map = cat_to_int(ms_train_df, ['Sex', 'Embarked'])
  42. enc_test_df = cat_to_int(ms_test_df, ['Sex', 'Embarked'], enc_map)
  43. # Data Imputation
  44. # drop EMbarked missing value records
  45. fl_train_df = enc_train_df.dropna(subset=['Embarked'])
  46. fl_test_df = enc_test_df.dropna(subset=['Embarked'])
  47. # knn imputation for age
  48. imputer = KNNImputer(n_neighbors=n_neigh, weights="uniform")
  49. imp_train_df = pd.DataFrame(imputer.fit_transform(fl_train_df.iloc[:,:-1]), columns=col_mask)
  50. imp_test_df = pd.DataFrame(imputer.transform(fl_test_df), columns=col_mask)
  51. imp_train_df['target'] = fl_train_df.iloc[:,-1]
  52. imp_train_df.to_csv(train_output, index=False)
  53. imp_test_df.to_csv(test_output, index=False)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...