1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
- local bd = import '../lib.jsonnet';  // Helper library from the parent directory; this file uses bd.pipeline, bd.cmd, bd.maybe and bd.config.
- bd.pipeline({  // One entry per stage, keyed by stage name; each stage declares cmd, deps and outs (some also wdir / metrics).
- 'collect-isbns': {  // No wdir: sibling directories are reached with '../' and the output is a bare file name.
- cmd: bd.cmd('collect-isbns -o all-isbns.parquet'),
- deps: std.prune([  // std.prune strips null entries; presumably bd.maybe yields null when its flag is false — confirm in lib.jsonnet.
- '../config.yaml',
- '../src/cli/collect_isbns.rs',
- '../loc-mds/book-isbns.parquet',
- '../openlibrary/edition-isbns.parquet',
- bd.maybe(bd.config.goodreads.enabled, '../goodreads/gr-book-ids.parquet'),  // Optional sources: each is gated on its own bd.config.<name>.enabled flag.
- bd.maybe(bd.config.bx.enabled, '../bx/cleaned-ratings.csv'),
- bd.maybe(bd.config.az2014.enabled, '../az2014/ratings.parquet'),
- bd.maybe(bd.config.az2018.enabled, '../az2018/ratings.parquet'),
- ]),
- outs: [
- 'all-isbns.parquet',
- ],
- },
- cluster: {  // Declares wdir '..'; all paths in this stage are therefore written from the parent directory (e.g. 'book-links/...').
- wdir: '..',
- cmd: bd.cmd('cluster-books --save-graph book-links/book-graph.mp.zst'),
- deps: [
- 'src/cli/cluster_books.rs',
- 'src/graph/',
- 'book-links/all-isbns.parquet',  // Same file name the 'collect-isbns' stage lists in its outs.
- 'loc-mds/book-ids.parquet',
- 'loc-mds/book-isbn-ids.parquet',
- 'openlibrary/editions.parquet',
- 'openlibrary/edition-isbn-ids.parquet',
- 'openlibrary/all-works.parquet',
- 'openlibrary/edition-works.parquet',
- ] + if bd.config.goodreads.enabled then [  // Goodreads inputs are appended only when enabled; uses inline if/else rather than the std.prune + bd.maybe form seen in other stages.
- 'goodreads/gr-book-ids.parquet',
- 'goodreads/book-isbn-ids.parquet',
- ] else [],
- outs: [
- 'book-links/book-graph.mp.zst',  // Matches the --save-graph argument in cmd.
- 'book-links/isbn-clusters.parquet',
- 'book-links/cluster-stats.parquet',
- 'book-links/cluster-graph-nodes.parquet',
- 'book-links/cluster-graph-edges.parquet',
- ],
- metrics: [  // NOTE(review): declared separately from outs; how bd.pipeline treats this field is not visible here — confirm.
- 'book-links/cluster-metrics.json',
- ],
- },
- 'cluster-ol-first-authors': {  // extract-authors with --first-author and a single source flag: -s openlib.
- wdir: '..',
- cmd: bd.cmd('cluster extract-authors -o book-links/cluster-ol-first-authors.parquet --first-author -s openlib'),
- deps: [
- 'src/cli/cluster',
- 'book-links/isbn-clusters.parquet',
- 'openlibrary/edition-isbn-ids.parquet',
- 'openlibrary/edition-authors.parquet',
- 'openlibrary/author-names.parquet',
- ],
- outs: [
- 'book-links/cluster-ol-first-authors.parquet',
- ],
- },
- 'cluster-loc-first-authors': {  // extract-authors with --first-author and a single source flag: -s loc.
- wdir: '..',
- cmd: bd.cmd('cluster extract-authors -o book-links/cluster-loc-first-authors.parquet --first-author -s loc'),
- deps: [
- 'src/cli/cluster',
- 'book-links/isbn-clusters.parquet',
- 'loc-mds/book-isbn-ids.parquet',
- 'loc-mds/book-authors.parquet',
- ],
- outs: [
- 'book-links/cluster-loc-first-authors.parquet',
- ],
- },
- 'cluster-first-authors': {  // extract-authors with both source flags (-s openlib -s loc); deps list the openlibrary and loc-mds inputs of the two single-source stages.
- wdir: '..',
- cmd: bd.cmd('cluster extract-authors -o book-links/cluster-first-authors.parquet --first-author -s openlib -s loc'),
- deps: [
- 'src/cli/cluster',
- 'book-links/isbn-clusters.parquet',
- 'openlibrary/edition-isbn-ids.parquet',
- 'openlibrary/edition-authors.parquet',
- 'openlibrary/author-names.parquet',
- 'loc-mds/book-isbn-ids.parquet',
- 'loc-mds/book-authors.parquet',
- ],
- outs: [
- 'book-links/cluster-first-authors.parquet',
- ],
- },
- 'cluster-genders': {  // Takes the combined first-authors file via -A and also depends on two viaf/ files.
- wdir: '..',
- cmd: bd.cmd('cluster extract-author-gender -o book-links/cluster-genders.parquet -A book-links/cluster-first-authors.parquet'),
- deps: [
- 'src/cli/cluster',
- 'book-links/cluster-stats.parquet',
- 'book-links/cluster-first-authors.parquet',
- 'viaf/author-name-index.parquet',
- 'viaf/author-genders.parquet',
- ],
- outs: [
- 'book-links/cluster-genders.parquet',
- ],
- },
- 'gender-stats': {  // cmd names no output path; the declared output is book-links/gender-stats.csv.
- wdir: '..',
- cmd: bd.cmd('integration-stats'),
- deps: std.prune([  // Same std.prune + bd.maybe form as 'collect-isbns': rating/action inputs are gated on each data set's enabled flag.
- 'src/cli/stats.rs',
- 'book-links/cluster-genders.parquet',
- 'book-links/isbn-clusters.parquet',
- 'loc-mds/book-isbn-ids.parquet',
- bd.maybe(bd.config.bx.enabled, 'bx/bx-cluster-actions.parquet'),
- bd.maybe(bd.config.bx.enabled, 'bx/bx-cluster-ratings.parquet'),
- bd.maybe(bd.config.az2014.enabled, 'az2014/az-cluster-ratings.parquet'),
- bd.maybe(bd.config.az2018.enabled, 'az2018/az-cluster-ratings.parquet'),
- bd.maybe(bd.config.goodreads.enabled, 'goodreads/gr-cluster-actions.parquet'),
- bd.maybe(bd.config.goodreads.enabled, 'goodreads/gr-cluster-ratings.parquet'),
- ]),
- outs: [
- 'book-links/gender-stats.csv',
- ],
- },
- 'cluster-hashes': {  // No wdir: the source file is referenced via '../' and the data files by bare name.
- cmd: bd.cmd('cluster hash -o cluster-hashes.parquet isbn-clusters.parquet'),
- deps: [
- '../src/cli/cluster/hash.rs',
- 'isbn-clusters.parquet',
- ],
- outs: [
- 'cluster-hashes.parquet',
- ],
- },
- })
|