Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

split_train_test.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
  1. import pandas as pd
  2. from sklearn.model_selection import train_test_split
  3. import conf
  4. INPUT = conf.source_tsv
  5. def output_name(fname, suffix):
  6. idx = fname.rfind('.')
  7. if idx < 0:
  8. return fname
  9. i = len(fname) - idx
  10. return fname[:-i] + suffix + fname[-i:]
  11. import sys
  12. try: #python2
  13. reload(sys)
  14. sys.setdefaultencoding('utf-8')
  15. except: pass
  16. if len(sys.argv) != 3:
  17. sys.stderr.write('Arguments error. Usage:\n')
  18. sys.stderr.write('\tpython split_train_test.py TEST_RATIO SEED\n')
  19. sys.stderr.write('\t\tTEST_RATIO - train set ratio (double). Example: 0.3\n')
  20. sys.stderr.write('\t\tSEED - random state (integer). Example: 20170423\n')
  21. sys.exit(1)
  22. test_ratio = float(sys.argv[1])
  23. seed = int(sys.argv[2])
  24. train = conf.train_tsv
  25. test = conf.test_tsv
  26. df = pd.read_csv(
  27. INPUT,
  28. encoding='utf-8',
  29. header=None,
  30. delimiter='\t',
  31. names=['id', 'label', 'text']
  32. )
  33. df_positive = df[df['label'] == 1]
  34. df_negative = df[df['label'] == 0]
  35. sys.stderr.write('Positive size {}, negative size {}\n'.format(
  36. df_positive.shape[0],
  37. df_negative.shape[0]
  38. ))
  39. def sub_df_by_ids(df, ids):
  40. df_train_order = pd.DataFrame(data={'id': ids})
  41. return df.merge(df_train_order, on='id')
  42. def train_test_split_df(df, ids, test_ratio, seed):
  43. train_ids, test_ids = train_test_split(ids, test_size=test_ratio, random_state=seed)
  44. return sub_df_by_ids(df, train_ids), sub_df_by_ids(df, test_ids)
  45. df_pos_train, df_pos_test = train_test_split_df(df, df_positive.id, test_ratio, seed)
  46. df_neg_train, df_neg_test = train_test_split_df(df, df_negative.id, test_ratio, seed)
  47. df_train = pd.concat([df_pos_train, df_neg_train])
  48. df_test = pd.concat([df_pos_test, df_neg_test])
  49. df_train.to_csv(train, sep='\t', header=False, index=False)
  50. df_test.to_csv(test, sep='\t', header=False, index=False)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...