Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

tasks.py 2.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
  1. import sys
  2. from pathlib import Path
  3. import subprocess as sp
  4. import os
  5. from invoke import task
  6. data_dir = Path('data')
  7. tgt_dir = Path('target')
  8. bin_dir = tgt_dir / 'release'
  9. def pipeline(steps, outfile=None):
  10. last = sp.DEVNULL
  11. if outfile is not None:
  12. outfd = os.open(outfile, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o666)
  13. else:
  14. outfd = None
  15. procs = []
  16. for step in steps[:-1]:
  17. proc = sp.Popen(step, stdin=last, stdout=sp.PIPE)
  18. last = proc.stdout
  19. procs.append(proc)
  20. proc = sp.Popen(steps[-1], stdin=last, stdout=outfd)
  21. procs.append(proc)
  22. for p, s in zip(procs, steps):
  23. rc = p.wait()
  24. if rc != 0:
  25. print(f'{s[0]} exited with code {rc}', file=sys.stderr)
  26. raise RuntimeError('subprocess failed')
  27. @task
  28. def build(c, debug=False):
  29. "Compile the Rust support executables"
  30. global bin_dir
  31. if debug:
  32. c.run('cargo build')
  33. bin_dir = tgt_dir / 'debug'
  34. else:
  35. c.run('cargo build --release')
  36. @task(build)
  37. def convert_viaf(c, date='20181104', progress=True):
  38. infile = data_dir / f'viaf-{date}-clusters-marc21.xml.gz'
  39. outfile = data_dir / f'viaf-{date}-clusters.psql.gz'
  40. pipeline([
  41. ['pv', infile],
  42. ['gunzip'],
  43. [bin_dir / 'parse-marc'],
  44. ['gzip']
  45. ], outfile=outfile)
  46. @task(build)
  47. def convert_ol_authors(c, date='2018-10-31', progress=True):
  48. infile = data_dir / f'ol_dump_authors_{date}.txt.gz'
  49. outfile = data_dir / f'ol_dump_authors_{date}.psql.gz'
  50. pipeline([
  51. ['pv', infile],
  52. ['gunzip'],
  53. [bin_dir / 'clean-openlib'],
  54. ['gzip']
  55. ], outfile=outfile)
  56. @task(build)
  57. def convert_ol_editions(c, date='2018-10-31', progress=True):
  58. infile = data_dir / f'ol_dump_editions_{date}.txt.gz'
  59. outfile = data_dir / f'ol_dump_editions_{date}.psql.gz'
  60. pipeline([
  61. ['pv', infile],
  62. ['gunzip'],
  63. [bin_dir / 'clean-openlib'],
  64. ['gzip']
  65. ], outfile=outfile)
  66. @task(build)
  67. def convert_ol_works(c, date='2018-10-31', progress=True):
  68. infile = data_dir / f'ol_dump_works_{date}.txt.gz'
  69. outfile = data_dir / f'ol_dump_works_{date}.psql.gz'
  70. pipeline([
  71. ['pv', infile],
  72. ['gunzip'],
  73. [bin_dir / 'clean-openlib'],
  74. ['gzip']
  75. ], outfile=outfile)
  76. if __name__ == '__main__':
  77. import invoke.program
  78. program = invoke.program.Program()
  79. program.run()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...