Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocessing.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.preprocessing import LabelEncoder
  4. import os
  5. import errno
  6. import sys
  # Directory (data/preprocessed) that the script entry point creates and writes the train/test files into.
  7. OUTPUT = os.path.join('data', 'preprocessed')
  # Passed as `test_size` to train_test_split in prepare_dataset.
  8. split = 0.2
  # Passed as `random_state` to train_test_split in prepare_dataset.
  9. random_seed = 42
  10. def mkdir_p(path):
  11. try:
  12. os.makedirs(path)
  13. except OSError as exc: # Python >2.5
  14. if exc.errno == errno.EEXIST and os.path.isdir(path):
  15. pass
  16. else:
  17. raise
  18. def preprocess(path):
  19. input = pd.read_csv(path, header=None)
  20. return(input)
  21. def prepare_dataset(df):
  22. df.rename({60: 'label'}, axis='columns', inplace=True)
  23. df['label'] = pd.Series(LabelEncoder().fit_transform(df['label'])).astype('category')
  24. # y = df['label'].copy(deep=True)
  25. # label_enc = LabelEncoder().fit(y)
  26. # y = label_enc.transform(y)
  27. # X = df.drop('label', axis=1).astype(float)
  28. # X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=split,
  29. # shuffle=True, random_state=random_seed)
  30. # df_train = pd.DataFrame(X_train).join(pd.Series(y_train, name='label'))
  31. # df_test = pd.DataFrame(X_test).join(pd.Series(y_test, name='label'))
  32. df_train, df_test = train_test_split(df, stratify=df['label'], test_size=split, shuffle=True, random_state=random_seed)
  33. return df_train, df_test
  34. if __name__ == "__main__":
  35. if len(sys.argv) != 2:
  36. sys.stderr.write('Arguments error. Usage:\n')
  37. sys.stderr.write('\tpython prepare.py data\n')
  38. sys.exit(1)
  39. mkdir_p(OUTPUT)
  40. input = preprocess(sys.argv[1])
  41. train, test = prepare_dataset(input)
  42. train.to_csv(os.path.join(OUTPUT, 'train.joblib'), index_label=False)
  43. test.to_csv(os.path.join(OUTPUT, 'test.joblib'), index_label=False)
  44. print('Saved train and test sets in ' + OUTPUT)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...