Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

xml_to_tsv.py 1.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
  1. import sys
  2. import os
  3. import re
  4. import xml.etree.ElementTree
  5. import conf
  6. INPUT = conf.source_xml
  7. OUTPUT = conf.source_tsv
  8. TAG = 'python'
  9. try: #python2
  10. reload(sys)
  11. sys.setdefaultencoding('utf-8')
  12. except: pass
  13. def print_usage(msg):
  14. if msg:
  15. sys.stderr.write('{}\n'.format(msg))
  16. sys.stderr.write('Usage:\n')
  17. sys.stderr.write('\tpython posts_to_tsv.py\n')
  18. def process_posts(fd_in, fd_out, target_tag):
  19. num = 1
  20. for line in fd_in:
  21. try:
  22. attr = xml.etree.ElementTree.fromstring(line).attrib
  23. id = attr.get('Id', '')
  24. label = 1 if target_tag in attr.get('Tags', '') else 0
  25. title = attr.get('Title', '').replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
  26. body = attr.get('Body', '').replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
  27. text = title + ' ' + body
  28. fd_out.write(u'{}\t{}\t{}\n'.format(id, label, text))
  29. num += 1
  30. except Exception as ex:
  31. sys.stderr.write('Error in line {}: {}\n'.format(num, ex))
  32. if __name__ == '__main__':
  33. target_tag = u'<' + TAG + '>'
  34. if not os.path.exists(INPUT):
  35. print_usage('Input file {} does not exist'.format(INPUT))
  36. sys.exit(1)
  37. with open(INPUT) as fd_in:
  38. with open(OUTPUT, 'w') as fd_out:
  39. process_posts(fd_in, fd_out, target_tag)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...