1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
- import sys
- from pathlib import Path
- import subprocess as sp
- import os
- from invoke import task
# Directory holding the input dumps and the converted output files.
data_dir = Path('data')
# Root of the build output tree.
tgt_dir = Path('target')
# Directory the helper executables are run from; release profile by default
# (the ``build`` task rebinds this global when a debug build is requested).
bin_dir = tgt_dir.joinpath('release')
def pipeline(steps, outfile=None):
    """Run a sequence of commands connected like a shell pipeline.

    Each step's standard output feeds the next step's standard input.  The
    first step reads from the null device, and the last step writes either to
    *outfile* or, when no file is given, to this process's standard output.

    Args:
        steps: non-empty sequence of argument lists, one per command, in
            pipeline order.  Each list is handed to ``subprocess.Popen``
            unchanged.
        outfile: optional path receiving the last step's output.  The file is
            created if missing and truncated if it already exists.

    Raises:
        ValueError: if *steps* is empty.
        RuntimeError: if any step exits with a non-zero status.  Every
            failing step is reported on standard error before raising.
    """
    if not steps:
        # Fail before touching the output file instead of truncating it and
        # then dying on an index error.
        raise ValueError('pipeline requires at least one step')

    outfd = None
    if outfile is not None:
        outfd = os.open(outfile, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o666)

    procs = []
    failed = False
    try:
        upstream = None  # read end of the previous step's stdout pipe
        final = len(steps) - 1
        for idx, step in enumerate(steps):
            try:
                proc = sp.Popen(
                    step,
                    stdin=sp.DEVNULL if upstream is None else upstream,
                    stdout=outfd if idx == final else sp.PIPE,
                )
            finally:
                # The child holds its own copy of the pipe.  Dropping ours
                # lets the producer see SIGPIPE/EOF if its consumer exits.
                if upstream is not None:
                    upstream.close()
            procs.append(proc)
            upstream = proc.stdout

        # Reap every step, not just up to the first failure, so no child is
        # left behind and all failures get reported.
        for proc, step in zip(procs, steps):
            rc = proc.wait()
            if rc != 0:
                print(f'{step[0]} exited with code {rc}', file=sys.stderr)
                failed = True
    except BaseException:
        # Start-up failed or we were interrupted: stop whatever is running.
        for proc in procs:
            if proc.poll() is None:
                proc.kill()
            proc.wait()
        raise
    finally:
        # The last child inherited its own copy of the descriptor.
        if outfd is not None:
            os.close(outfd)

    if failed:
        raise RuntimeError('subprocess failed')
@task
def build(c, debug=False):
    "Compile the Rust support executables"
    # ``bin_dir`` is module state read by the conversion tasks, so a debug
    # build has to redirect it to the debug output directory.
    global bin_dir
    if not debug:
        # Release is the default profile; ``bin_dir`` is left as it is.
        c.run('cargo build --release')
        return
    c.run('cargo build')
    bin_dir = tgt_dir.joinpath('debug')
@task(build)
def convert_viaf(c, date='20181104', progress=True):
    """Convert a VIAF MARC21 cluster dump with ``parse-marc``.

    Decompresses ``viaf-{date}-clusters-marc21.xml.gz`` from the data
    directory, pipes it through the ``parse-marc`` executable, and writes the
    gzip-compressed result to ``viaf-{date}-clusters.psql.gz``.

    Args:
        c: invoke context (not used by this task).
        date: date stamp embedded in the dump's file name.
        progress: when true (the default), the input is read through ``pv``
            so a progress meter is shown; when false, ``gunzip`` reads the
            file directly and ``pv`` is not needed.
    """
    infile = data_dir / f'viaf-{date}-clusters-marc21.xml.gz'
    outfile = data_dir / f'viaf-{date}-clusters.psql.gz'
    if progress:
        reader = [['pv', infile], ['gunzip']]
    else:
        # Previously the flag was ignored and ``pv`` always ran.
        reader = [['gunzip', '-c', infile]]
    pipeline(reader + [
        [bin_dir / 'parse-marc'],
        ['gzip'],
    ], outfile=outfile)
-
@task(build)
def convert_ol_authors(c, date='2018-10-31', progress=True):
    """Clean an OpenLibrary authors dump with ``clean-openlib``.

    Decompresses ``ol_dump_authors_{date}.txt.gz`` from the data directory,
    pipes it through the ``clean-openlib`` executable, and writes the
    gzip-compressed result to ``ol_dump_authors_{date}.psql.gz``.

    Args:
        c: invoke context (not used by this task).
        date: date stamp embedded in the dump's file name.
        progress: when true (the default), the input is read through ``pv``
            so a progress meter is shown; when false, ``gunzip`` reads the
            file directly and ``pv`` is not needed.
    """
    infile = data_dir / f'ol_dump_authors_{date}.txt.gz'
    outfile = data_dir / f'ol_dump_authors_{date}.psql.gz'
    if progress:
        reader = [['pv', infile], ['gunzip']]
    else:
        # Previously the flag was ignored and ``pv`` always ran.
        reader = [['gunzip', '-c', infile]]
    pipeline(reader + [
        [bin_dir / 'clean-openlib'],
        ['gzip'],
    ], outfile=outfile)
@task(build)
def convert_ol_editions(c, date='2018-10-31', progress=True):
    """Clean an OpenLibrary editions dump with ``clean-openlib``.

    Decompresses ``ol_dump_editions_{date}.txt.gz`` from the data directory,
    pipes it through the ``clean-openlib`` executable, and writes the
    gzip-compressed result to ``ol_dump_editions_{date}.psql.gz``.

    Args:
        c: invoke context (not used by this task).
        date: date stamp embedded in the dump's file name.
        progress: when true (the default), the input is read through ``pv``
            so a progress meter is shown; when false, ``gunzip`` reads the
            file directly and ``pv`` is not needed.
    """
    infile = data_dir / f'ol_dump_editions_{date}.txt.gz'
    outfile = data_dir / f'ol_dump_editions_{date}.psql.gz'
    if progress:
        reader = [['pv', infile], ['gunzip']]
    else:
        # Previously the flag was ignored and ``pv`` always ran.
        reader = [['gunzip', '-c', infile]]
    pipeline(reader + [
        [bin_dir / 'clean-openlib'],
        ['gzip'],
    ], outfile=outfile)
@task(build)
def convert_ol_works(c, date='2018-10-31', progress=True):
    """Clean an OpenLibrary works dump with ``clean-openlib``.

    Decompresses ``ol_dump_works_{date}.txt.gz`` from the data directory,
    pipes it through the ``clean-openlib`` executable, and writes the
    gzip-compressed result to ``ol_dump_works_{date}.psql.gz``.

    Args:
        c: invoke context (not used by this task).
        date: date stamp embedded in the dump's file name.
        progress: when true (the default), the input is read through ``pv``
            so a progress meter is shown; when false, ``gunzip`` reads the
            file directly and ``pv`` is not needed.
    """
    infile = data_dir / f'ol_dump_works_{date}.txt.gz'
    outfile = data_dir / f'ol_dump_works_{date}.psql.gz'
    if progress:
        reader = [['pv', infile], ['gunzip']]
    else:
        # Previously the flag was ignored and ``pv`` always ran.
        reader = [['gunzip', '-c', infile]]
    pipeline(reader + [
        [bin_dir / 'clean-openlib'],
        ['gzip'],
    ], outfile=outfile)
if __name__ == '__main__':
    # Allow running this file directly: hand control to invoke's own
    # command-line program.
    from invoke.program import Program
    Program().run()
|