Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurization.py 1.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  1. #######################################################
  2. # Receives train and test *.csv sets and generates features for each of them
  3. # Returns train and test *.joblib sets with new features
  4. import pandas as pd
  5. import os
  6. import errno
  7. import sys
  8. import joblib
  9. def mkdir_p(path):
  10. try:
  11. os.makedirs(path)
  12. except OSError as exc: # Python >2.5
  13. if exc.errno == errno.EEXIST and os.path.isdir(path):
  14. pass
  15. else:
  16. raise
  17. def get_df(path):
  18. df = pd.read_csv(path)
  19. sys.stderr.write('The input data frame {} size is {}\n'.format(path, df.shape))
  20. return df
  21. def featurize(train_path, test_path):
  22. """
  23. Generates features for the train and test sets
  24. # TODO: for now it's only the identity function to stage the pipeline
  25. :param train_path: path to input train dataset, csv file
  26. :param test_path: path to input test dataset, csv file
  27. :return: tuple, (train, test), featurized pandas dataframes
  28. """
  29. df_train = get_df(train_path)
  30. features_train = df_train
  31. df_test = get_df(test_path)
  32. features_test = df_test
  33. return features_train, features_test
  34. if __name__ == "__main__":
  35. if len(sys.argv) != 3 and len(sys.argv) != 5:
  36. sys.stderr.write('Arguments error. Usage:\n')
  37. sys.stderr.write('\tpython featurization.py data-dir-path features-dir-path\n')
  38. sys.exit(1)
  39. mkdir_p(sys.argv[2])
  40. train_input = os.path.join(sys.argv[1], 'train.joblib')
  41. test_input = os.path.join(sys.argv[1], 'test.joblib')
  42. train_output = os.path.join(sys.argv[2], 'train.joblib')
  43. test_output = os.path.join(sys.argv[2], 'test.joblib')
  44. # try:
  45. # reload(sys)
  46. # sys.setdefaultencoding('utf-8')
  47. # except NameError:
  48. # pass
  49. train_features, test_features = featurize(train_input, test_input)
  50. joblib.dump(train_features, train_output)
  51. joblib.dump(test_features, test_output)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...