Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

prepare.py 1.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
  1. import io
  2. import sys
  3. import xml.etree.ElementTree
  4. import random
  5. import re
  6. import os
  7. import yaml
  # Load the settings of the 'prepare' stage. The context manager closes
  # params.yaml as soon as it has been parsed instead of leaving the handle
  # open until the interpreter collects it.
  with open('params.yaml') as params_file:
      params = yaml.safe_load(params_file)['prepare']

  # Exactly one command-line argument (the data file) is required.
  if len(sys.argv) != 2:
      sys.stderr.write("Arguments error. Usage:\n")
      sys.stderr.write("\tpython prepare.py data-file\n")
      sys.exit(1)

  # Test data set split ratio
  split = params['split']
  # Seed the module-level generator with the configured seed.
  random.seed(params['seed'])

  # NOTE: `input` shadows the built-in of the same name; the name is kept
  # because the code that opens the data file further down reads it.
  input = sys.argv[1]
  output_train = os.path.join('data', 'prepared', 'train.tsv')
  output_test = os.path.join('data', 'prepared', 'test.tsv')
  def process_posts(fd_in, fd_out_train, fd_out_test, target_tag,
                    split_ratio=None):
      """Split one-XML-element-per-line posts into train and test TSV rows.

      Each line of ``fd_in`` is parsed as a standalone XML element. For every
      line that parses, one row ``<Id>\\t<label>\\t<text>\\n`` is written to
      either ``fd_out_train`` or ``fd_out_test``:

      * ``label`` is 1 when ``target_tag`` occurs in the ``Tags`` attribute,
        otherwise 0;
      * ``text`` is the ``Title`` and ``Body`` attributes, each with runs of
        whitespace collapsed to a single space and stripped, joined by one
        space.

      Lines that are not well-formed XML are reported on stderr together with
      their 1-based line number in ``fd_in`` and are skipped.

      Args:
          fd_in: iterable of text lines.
          fd_out_train: writable text stream receiving the training rows.
          fd_out_test: writable text stream receiving the test rows.
          target_tag: substring searched for in the ``Tags`` attribute.
          split_ratio: a row goes to the test stream when
              ``random.random() > split_ratio`` is false. Defaults to the
              module-level ``split`` value when omitted.
      """
      if split_ratio is None:
          split_ratio = split

      whitespace = re.compile(r'\s+')

      # enumerate() numbers every input line, so the number reported for a
      # broken line is its real position even after earlier failures.
      for num, line in enumerate(fd_in, start=1):
          # Draw once per input line (broken or not) so the sequence of
          # random numbers consumed for a given seed does not depend on
          # which lines fail to parse.
          fd_out = fd_out_train if random.random() > split_ratio else fd_out_test

          # Only parsing is expected to fail; errors while writing the
          # output are real failures and are allowed to propagate.
          try:
              attr = xml.etree.ElementTree.fromstring(line).attrib
          except xml.etree.ElementTree.ParseError as ex:
              sys.stderr.write(f'Skipping the broken line {num}: {ex}\n')
              continue

          pid = attr.get('Id', '')
          label = 1 if target_tag in attr.get('Tags', '') else 0
          title = whitespace.sub(' ', attr.get('Title', '')).strip()
          body = whitespace.sub(' ', attr.get('Body', '')).strip()
          text = title + ' ' + body
          fd_out.write(u'{}\t{}\t{}\n'.format(pid, label, text))
  # Make sure the output directory exists before the result files are opened.
  prepared_dir = os.path.join('data', 'prepared')
  os.makedirs(prepared_dir, exist_ok=True)

  # Open the input file and both output files in a single with-statement so
  # all three are closed together once processing finishes.
  with io.open(input, encoding='utf8') as fd_in, \
          io.open(output_train, 'w', encoding='utf8') as fd_out_train, \
          io.open(output_test, 'w', encoding='utf8') as fd_out_test:
      process_posts(fd_in, fd_out_train, fd_out_test, '<python>')
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...