Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

prepare.py 1.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
  1. import io
  2. import sys
  3. import xml.etree.ElementTree
  4. import random
  5. import re
  6. import os
  7. import errno
  # Command-line contract: exactly one argument, the path of the input file.
  if len(sys.argv) != 2:
      sys.stderr.write('Arguments error. Usage:\n')
      sys.stderr.write('\tpython prepare.py data\n')
      sys.exit(1)

  # Test data set split ratio
  split = 0.20
  # Fixed seed so the random train/test assignment is repeatable across runs.
  random.seed(20170426)

  # NOTE: ``input`` shadows the builtin; the name is kept because the
  # script's final block opens the file through it.
  input = sys.argv[1]
  output_train = os.path.join('data', 'prepared', 'train.tsv')
  output_test = os.path.join('data', 'prepared', 'test.tsv')

  # Python 2 only: make UTF-8 the implicit str/unicode codec. ``reload`` is
  # not a builtin on Python 3, so the NameError branch turns this into a no-op.
  try:
      reload(sys)
      sys.setdefaultencoding('utf-8')
  except NameError:
      pass
  def mkdir_p(path):
      """Create directory ``path`` and any missing parents, like ``mkdir -p``.

      An already existing directory is not an error. Any other ``OSError``
      (including ``path`` existing but not being a directory) is re-raised.
      """
      try:
          os.makedirs(path)
      except OSError as exc:  # Python >2.5
          # Only "already exists, and it is a directory" is tolerated.
          if exc.errno != errno.EEXIST or not os.path.isdir(path):
              raise
  def process_posts(fd_in, fd_out_train, fd_out_test, target_tag,
                    test_ratio=None):
      """Convert XML post rows into TSV lines split between two outputs.

      Each line of ``fd_in`` is parsed as a standalone XML element. Its
      ``Id``, ``Tags``, ``Title`` and ``Body`` attributes become one
      ``id<TAB>label<TAB>text`` line, where ``label`` is 1 when
      ``target_tag`` occurs in ``Tags`` (0 otherwise) and ``text`` is the
      whitespace-collapsed title and body joined by a space.

      Args:
          fd_in: iterable of text lines, one XML element per line.
          fd_out_train: writable text stream receiving the training rows.
          fd_out_test: writable text stream receiving the test rows.
          target_tag: substring looked up in the ``Tags`` attribute.
          test_ratio: a row goes to ``fd_out_train`` when
              ``random.random() > test_ratio``, otherwise to
              ``fd_out_test``. Defaults to the module-level ``split``.

      Lines that fail to parse or write are reported on stderr with their
      1-based position in ``fd_in`` and skipped.
      """
      if test_ratio is None:
          test_ratio = split
      collapse_ws = re.compile(r'\s+')

      for num, line in enumerate(fd_in, 1):
          try:
              # Draw before parsing so every input line, broken or not,
              # consumes exactly one random number; this keeps the seeded
              # train/test assignment stable.
              if random.random() > test_ratio:
                  fd_out = fd_out_train
              else:
                  fd_out = fd_out_test
              attr = xml.etree.ElementTree.fromstring(line).attrib
              pid = attr.get('Id', '')
              label = 1 if target_tag in attr.get('Tags', '') else 0
              title = collapse_ws.sub(' ', attr.get('Title', '')).strip()
              body = collapse_ws.sub(' ', attr.get('Body', '')).strip()
              text = title + ' ' + body
              fd_out.write(u'{}\t{}\t{}\n'.format(pid, label, text))
          except Exception as ex:
              # Best effort: report the offending input line and carry on.
              sys.stderr.write(
                  'Skipping the broken line {}: {}\n'.format(num, ex))
  # Make sure the output directory exists before opening the TSV files.
  mkdir_p(os.path.join('data', 'prepared'))

  # Stream the input through process_posts; the three files are opened as
  # UTF-8 text and closed on exit, even if processing raises.
  with io.open(input, encoding='utf8') as fd_in:
      with io.open(output_train, 'w', encoding='utf8') as fd_out_train:
          with io.open(output_test, 'w', encoding='utf8') as fd_out_test:
              process_posts(fd_in, fd_out_train, fd_out_test, u'<python>')
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...