Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

xml_to_tsv.py 2.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
  """
  Transform XML data file to a TSV file.

  Routine Listings
  ----------------
  get_params()
      Get the DVC stage parameters.
  process_xml_to_tsv(input_path, output_path)
      Load and process XML file and save the data to TSV file.
  """
  10. import sys
  11. import os
  12. import dask
  13. import dask.distributed
  14. import xml.etree.ElementTree
  15. import conf
  def get_params() -> dict:
      """Get the DVC stage parameters.

      Returns
      -------
      dict
          A new, empty dict on every call; this stage currently defines no
          parameters.
      """
      return {}
  @dask.delayed
  def process_xml_to_tsv(input_path, output_path):
      """Load and process XML file and save the data to TSV file.

      Every line of the input file is parsed as a standalone XML element.
      For each line that parses, one row ``Id<TAB>label<TAB>text`` is
      written to the output file, where:

      * ``Id`` is the element's ``Id`` attribute (empty string if absent);
      * ``label`` is 1 when the ``Tags`` attribute contains ``<python>``,
        otherwise 0;
      * ``text`` is the ``Title`` and ``Body`` attributes joined by a single
        space, with tabs, newlines and carriage returns replaced by spaces
        so the row stays on one TSV line.

      Lines that cannot be processed are reported on stderr together with
      their 1-based line number and are skipped.

      Parameters
      ----------
      input_path : path-like
          Path of the XML file to read.
      output_path : path-like
          Path of the TSV file to write (overwritten if it exists).

      Raises
      ------
      SystemExit
          With status 1 when `input_path` does not exist.
      """
      TAG = 'python'
      target_tag = f'<{TAG}>'
      # One-pass replacement of the characters that would break a TSV row.
      flatten = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})

      if not os.path.exists(input_path):
          # Terminate the first message with a newline so that "Usage:" does
          # not run onto the same line.
          sys.stderr.write(f'Input file {input_path} does not exist\n')
          sys.stderr.write('Usage:\n')
          sys.stderr.write('\tpython xml_to_tsv.py\n')
          sys.exit(1)

      # Explicit UTF-8 so the result does not depend on the platform's
      # default locale encoding.
      with open(input_path, encoding='utf-8') as fd_in, \
              open(output_path, 'w', encoding='utf-8') as fd_out:
          # enumerate() numbers every input line, including the ones that
          # fail, so the line number reported on error is the real one.
          for num, line in enumerate(fd_in, start=1):
              try:
                  attr = xml.etree.ElementTree.fromstring(line).attrib
                  post_id = attr.get('Id', '')
                  label = 1 if target_tag in attr.get('Tags', '') else 0
                  title = attr.get('Title', '').translate(flatten)
                  body = attr.get('Body', '').translate(flatten)
                  text = title + ' ' + body
                  fd_out.write(f'{post_id}\t{label}\t{text}\n')
              except Exception as ex:
                  # Best effort: report the offending line and keep going.
                  sys.stderr.write(f'Error in line {num}: {ex}\n')
  if __name__ == '__main__':
      # Connect to the Dask scheduler at this fixed address.
      client = dask.distributed.Client('localhost:8786')

      # The stage is named after this script, without directory or extension.
      # str.strip('.py') is not a suffix remover: it strips any of the
      # characters '.', 'p' and 'y' from BOTH ends, so 'prepare.py' would
      # become 'repare' and './xml_to_tsv.py' would become '/xml_to_tsv'
      # (which then turns the joined output path into an absolute one).
      dvc_stage_name = os.path.splitext(os.path.basename(__file__))[0]

      INPUT_DATASET_XML_PATH = conf.data_dir/'download_xml'/'Posts.xml'
      STAGE_OUTPUT_PATH = conf.data_dir/dvc_stage_name
      # Create the stage output directory before the conversion task runs.
      conf.remote_mkdir(STAGE_OUTPUT_PATH).compute()
      OUTPUT_DATASET_TSV_PATH = STAGE_OUTPUT_PATH/'Posts.tsv'

      # process_xml_to_tsv is a delayed task; .compute() actually runs it.
      process_xml_to_tsv(
          INPUT_DATASET_XML_PATH, OUTPUT_DATASET_TSV_PATH).compute()
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...