1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
schema: '2.0'
stages:
scan-interactions:
cmd: cargo run --release -- goodreads scan interactions ../data/goodreads/goodreads_interactions.json.gz
deps:
- path: ../data/goodreads/goodreads_interactions.json.gz
hash: md5
md5: f2d054a85f33d405a9bff6933005ba89
size: 9388113365
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: ../src/goodreads
hash: md5
md5: e8117b3c433c7fa4f7ffbaf7cb8d7f06.dir
size: 19592
nfiles: 9
outs:
- path: gr-interactions.parquet
hash: md5
md5: b30430653770266a99921696af2d4044
size: 4587488091
- path: gr-users.parquet
hash: md5
md5: ceb6314193b9a242df2a4db02e39f639
size: 18637038
scan-book-info:
cmd: cargo run --release -- goodreads scan books ../data/goodreads/goodreads_books.json.gz
deps:
- path: ../data/goodreads/goodreads_books.json.gz
md5: 01b40c70a00fb6aa321ee478f0fd0d6b
size: 2043729443
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: ../src/goodreads
hash: md5
md5: e8117b3c433c7fa4f7ffbaf7cb8d7f06.dir
size: 19592
nfiles: 9
outs:
- path: gr-book-authors.parquet
hash: md5
md5: 588ef6c68a1a1a3418dac0164da333f7
size: 21162981
- path: gr-book-ids.parquet
hash: md5
md5: 402aceb6914ba3453d230144b39fbfe8
size: 46635671
- path: gr-book-info.parquet
hash: md5
md5: e5dd7a6abf966ecb35abac621b234db2
size: 12358354
- path: gr-book-series.parquet
hash: md5
md5: 30a54ee855d2a14238c215d96b5e6625
size: 5857161
scan-work-info:
cmd: cargo run --release -- goodreads scan works ../data/goodreads/goodreads_book_works.json.gz
deps:
- path: ../data/goodreads/goodreads_book_works.json.gz
md5: e80738a88d02d2b0081cd249d9b4f081
size: 81412944
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: ../src/goodreads
hash: md5
md5: e8117b3c433c7fa4f7ffbaf7cb8d7f06.dir
size: 19592
nfiles: 9
outs:
- path: gr-work-info.parquet
hash: md5
md5: f67f4698e120af10eefae7ce4fb11498
size: 19901545
book-isbn-ids:
cmd: cargo run --release -- link-isbn-ids -o goodreads/book-isbn-ids.parquet -R
book_id -I isbn10 -I isbn13 -I asin goodreads/gr-book-ids.parquet
deps:
- path: book-links/all-isbns.parquet
hash: md5
md5: 8803c162ab97efac8b098df7e9252314
size: 464207536
- path: goodreads/gr-book-ids.parquet
hash: md5
md5: 402aceb6914ba3453d230144b39fbfe8
size: 46635671
- path: src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
outs:
- path: goodreads/book-isbn-ids.parquet
hash: md5
md5: 5c95445b6996decd1754750077214289
size: 16211736
cluster-ratings:
cmd: cargo run --release -- goodreads cluster-interactions --ratings -o goodreads/gr-cluster-ratings.parquet
deps:
- path: goodreads/gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
- path: goodreads/gr-interactions.parquet
hash: md5
md5: b30430653770266a99921696af2d4044
size: 4587488091
- path: src/cli/goodreads/cluster.rs
hash: md5
md5: 840dee408f740648113860ab5dbd3ec7
size: 6175
outs:
- path: goodreads/gr-cluster-ratings.parquet
hash: md5
md5: 0afc2f46e59c3f22f203c3c0759653c2
size: 1369545709
book-links:
cmd: cargo run --release -- cluster extract-books -o goodreads/gr-book-link.parquet
-n book_id --join-file goodreads/gr-book-ids.parquet --join-field work_id GR-B
deps:
- path: book-links/cluster-graph-nodes.parquet
hash: md5
md5: 2f0e64cd13b40c850326a75d4e69731a
size: 1226145163
- path: goodreads/gr-book-ids.parquet
hash: md5
md5: 402aceb6914ba3453d230144b39fbfe8
size: 46635671
outs:
- path: goodreads/gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
cluster-interactions:
cmd: python cluster-ratings.py
deps:
- path: cluster-ratings.py
md5: d44be03784268f02b28c1b40bc108c27
size: 1219
- path: gr-book-link.parquet
md5: f31fc29c8644c439e192af9868604a93
size: 29380146
- path: gr-interactions.parquet
md5: 0ee400ec374aa0263198b25e9d9140e0
size: 1619196241
outs:
- path: gr-cluster-ratings.parquet
md5: af8b42db6e3b8f0ded896f7cab433530
size: 327784394
cluster-actions:
cmd: cargo run --release -- goodreads cluster-interactions --add-actions -o goodreads/gr-cluster-actions.parquet
deps:
- path: goodreads/gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
- path: goodreads/gr-interactions.parquet
hash: md5
md5: b30430653770266a99921696af2d4044
size: 4587488091
- path: src/cli/goodreads/cluster.rs
hash: md5
md5: 840dee408f740648113860ab5dbd3ec7
size: 6175
outs:
- path: goodreads/gr-cluster-actions.parquet
hash: md5
md5: 6f3181c11589740851b49364f7140793
size: 2709707272
work-ratings:
cmd: cargo run --release -- goodreads cluster-interactions --ratings --native-works
-o goodreads/gr-work-ratings.parquet
deps:
- path: goodreads/gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
- path: goodreads/gr-interactions.parquet
hash: md5
md5: b30430653770266a99921696af2d4044
size: 4587488091
- path: src/cli/goodreads/cluster.rs
hash: md5
md5: 840dee408f740648113860ab5dbd3ec7
size: 6175
outs:
- path: goodreads/gr-work-ratings.parquet
hash: md5
md5: f4d72ec57c57ac3d013b7271c79d950b
size: 1434246294
work-actions:
cmd: cargo run --release -- goodreads cluster-interactions --add-actions --native-works
-o goodreads/gr-work-actions.parquet
deps:
- path: goodreads/gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
- path: goodreads/gr-interactions.parquet
hash: md5
md5: b30430653770266a99921696af2d4044
size: 4587488091
- path: src/cli/goodreads/cluster.rs
hash: md5
md5: 840dee408f740648113860ab5dbd3ec7
size: 6175
outs:
- path: goodreads/gr-work-actions.parquet
hash: md5
md5: d20518227c34e9b440ba5f39e85da954
size: 2808086719
work-gender:
cmd: cargo run --release -- goodreads work-gender
deps:
- path: ../book-links/cluster-genders.parquet
hash: md5
md5: 5aef997fa371322d26041bbde5642c0d
size: 180340846
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
outs:
- path: gr-work-gender.parquet
hash: md5
md5: ae8f42aedc5976b2e7ba17ad31c4d615
size: 27992752
schema@gr-work-gender:
cmd: python ../run.py --rust pq-info -o gr-work-gender.json gr-work-gender.parquet
deps:
- path: gr-work-gender.parquet
md5: 45e9d912b392b0d714f67c126531bbae
size: 28891449
outs:
- path: gr-work-gender.json
md5: 4c896ebcdce4495982076d1150ea9e5a
size: 423
schema@gr-work-info:
cmd: python ../run.py --rust pq-info -o gr-work-info.json gr-work-info.parquet
deps:
- path: gr-work-info.parquet
md5: 79b363824af58bff7fa61e645bbe23b1
size: 21837297
outs:
- path: gr-work-info.json
md5: 833d0d78ade406bdde4f310c619c7c13
size: 517
schema@gr-work-actions:
cmd: python ../run.py --rust pq-info -o gr-work-actions.json gr-work-actions.parquet
deps:
- path: gr-work-actions.parquet
md5: a15faa45f6b956f93a7795b453d946e8
size: 1569601498
outs:
- path: gr-work-actions.json
md5: fbd53ff95cd17f34166e93c8b039cb07
size: 618
schema@gr-work-ratings:
cmd: python ../run.py --rust pq-info -o gr-work-ratings.json gr-work-ratings.parquet
deps:
- path: gr-work-ratings.parquet
md5: 0d9f519acad4d8f1b94dafc54bace4a3
size: 1640676645
outs:
- path: gr-work-ratings.json
md5: 77c0c065962ddeee0d183dae1ac1e897
size: 708
schema@gr-book-info:
cmd: python ../run.py --rust pq-info -o gr-book-info.json gr-book-info.parquet
deps:
- path: gr-book-info.parquet
md5: a4344a4f10eb8631049a5d2f4cf91e7f
size: 15146558
outs:
- path: gr-book-info.json
md5: 8cc501ca40ea549e8d27d948b3ff0231
size: 518
schema@book-isbn-ids:
cmd: python ../run.py --rust pq-info -o book-isbn-ids.json book-isbn-ids.parquet
deps:
- path: book-isbn-ids.parquet
md5: 100ea102d2775993cc40522df7210687
size: 15490623
outs:
- path: book-isbn-ids.json
md5: 771436a0047e0443854ba36d95411b20
size: 249
schema@gr-interactions:
cmd: python ../run.py --rust pq-info -o gr-interactions.json gr-interactions.parquet
deps:
- path: gr-interactions.parquet
md5: 9788655b2499eb8150398f8c1558e823
size: 4372045343
outs:
- path: gr-interactions.json
md5: 9b3af7db2bcd5cb7616d6f0a17daefb4
size: 990
schema@gr-book-ids:
cmd: python ../run.py --rust pq-info -o gr-book-ids.json gr-book-ids.parquet
deps:
- path: gr-book-ids.parquet
md5: e259dab35fc7e37e8904bc2584245138
size: 37497686
outs:
- path: gr-book-ids.json
md5: 130b3c6d3d145ee97264041a3637ef7e
size: 507
schema@gr-cluster-ratings:
cmd: python ../run.py --rust pq-info -o gr-cluster-ratings.json gr-cluster-ratings.parquet
deps:
- path: gr-cluster-ratings.parquet
md5: 353cd5edc3df644c77fe01da8f26a436
size: 1593822465
outs:
- path: gr-cluster-ratings.json
md5: 6a5a2ef9fa4305c24fd17082f9d976e4
size: 707
schema@gr-users:
cmd: python ../run.py --rust pq-info -o gr-users.json gr-users.parquet
deps:
- path: gr-users.parquet
md5: ff0d06650a9944bf80cc8c4a99827c1d
size: 18683237
outs:
- path: gr-users.json
md5: 70a463f659828f45949b5288557788f7
size: 244
schema@gr-book-link:
cmd: python ../run.py --rust pq-info -o gr-book-link.json gr-book-link.parquet
deps:
- path: gr-book-link.parquet
md5: 17c43318b93c22563377f71b2ee45a1e
size: 19787709
outs:
- path: gr-book-link.json
md5: e6406cbc13eae52d6b605604c709a4af
size: 338
schema@gr-cluster-actions:
cmd: python ../run.py --rust pq-info -o gr-cluster-actions.json gr-cluster-actions.parquet
deps:
- path: gr-cluster-actions.parquet
md5: a93581e4cc78bf216b34358417954393
size: 1525089046
outs:
- path: gr-cluster-actions.json
md5: 6929df3364058a637d1aa4df32ba5567
size: 617
scan-book-genres:
cmd: cargo run --release -- goodreads scan genres ../data/goodreads/goodreads_book_genres_initial.json.gz
deps:
- path: ../data/goodreads/goodreads_book_genres_initial.json.gz
md5: 99ee3d1cadd68818c3dd0ef0d2f10602
size: 24253992
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: ../src/goodreads
hash: md5
md5: e8117b3c433c7fa4f7ffbaf7cb8d7f06.dir
size: 19592
nfiles: 9
outs:
- path: gr-book-genres.parquet
hash: md5
md5: cb1b59dbac1ccef2acd1ac225818b629
size: 19187256
- path: gr-genres.parquet
hash: md5
md5: c24a0e55e1bc79258188aac5da3e1ac3
size: 809
schema@gr-genres:
cmd: python ../run.py --rust pq-info -o gr-genres.json gr-genres.parquet
deps:
- path: gr-genres.parquet
md5: c24a0e55e1bc79258188aac5da3e1ac3
size: 809
outs:
- path: gr-genres.json
md5: 8df384efb85a16440ee09f14a3da1771
size: 235
schema@gr-book-series:
cmd: python ../run.py --rust pq-info -o gr-book-series.json gr-book-series.parquet
deps:
- path: gr-book-series.parquet
md5: 6688b051bf7a7b3ed720a49ba74a528c
size: 5654585
outs:
- path: gr-book-series.json
md5: 02b8f57e20dbdddb8fc883fc03ef0561
size: 245
schema@gr-book-genres:
cmd: python ../run.py --rust pq-info -o gr-book-genres.json gr-book-genres.parquet
deps:
- path: gr-book-genres.parquet
md5: 96e51ae7b7e09f9d752b110306bc8dd1
size: 17278459
outs:
- path: gr-book-genres.json
md5: 0a9f5acaf9bc9c79b6c2792f311f9889
size: 338
scan-simple-interactions:
cmd: python ../run.py --rust goodreads scan interactions --csv --book-map ../data/goodreads/book_id_map.csv
../data/goodreads/goodreads_interactions.csv
deps:
- path: ../data/goodreads/book_id_map.csv
md5: c4e5afd568df2f7a4a8a52f3eeb88413
size: 37846957
- path: ../data/goodreads/goodreads_interactions.csv
md5: 696fbf71f0082c0b6a2379182b147c1e
size: 4318621741
- path: ../src/cli/goodreads.rs
md5: 1fe05e7e29045b7ad1528df9af270c2d
size: 3080
- path: ../src/goodreads
md5: 2a97b45388d5581a7db8e442cba294fb.dir
size: 13762
nfiles: 6
outs:
- path: gr-simple-interactions.parquet
md5: e01dd1692896c9ae0a2b18e94b94e5aa
size: 1456440955
cluster-simple-ratings:
cmd: python ../run.py gr-cluster-interactions.py --ratings --simple -o gr-cluster-simple-ratings.parquet
deps:
- path: gr-book-link.parquet
md5: a8fbe0288a2682a983fe9550e500ad93
size: 20310729
- path: gr-cluster-interactions.py
md5: f3bff4368de9ccfc6a9d92f9787eceb8
size: 4159
- path: gr-simple-interactions.parquet
md5: e01dd1692896c9ae0a2b18e94b94e5aa
size: 1456440955
outs:
- path: gr-cluster-simple-ratings.parquet
md5: 0675244e9a9ba0d451bbe12dbcb3cbe4
size: 689950939
scan-author-info:
cmd: cargo run --release -- goodreads scan authors ../data/goodreads/goodreads_book_authors.json.gz
deps:
- path: ../data/goodreads/goodreads_book_authors.json.gz
md5: b193c3febd961fb69443b65ba05b83a7
size: 17877585
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: ../src/goodreads
hash: md5
md5: e8117b3c433c7fa4f7ffbaf7cb8d7f06.dir
size: 19592
nfiles: 9
outs:
- path: gr-author-info.parquet
hash: md5
md5: cf52195249be80735cd05e396fd749bf
size: 10149282
schema@gr-author-info:
cmd: python ../run.py --rust pq-info -o gr-author-info.json gr-author-info.parquet
deps:
- path: gr-author-info.parquet
md5: f91028921cc88b670cdcfddc8f66d23a
size: 10031105
outs:
- path: gr-author-info.json
md5: d60cc12c1bab7ad51515067d976ff3d1
size: 245
work-actions-5core:
cmd: cargo run --release -- kcore -o gr-work-actions-5core.parquet gr-work-actions.parquet
deps:
- path: ../src/cli/kcore.rs
hash: md5
md5: 9a64f2beb19d2053d9c2386609beafe9
size: 4874
- path: gr-work-actions.parquet
hash: md5
md5: d20518227c34e9b440ba5f39e85da954
size: 2808086719
outs:
- path: gr-work-actions-5core.parquet
hash: md5
md5: 9304c4ed695ecd786ccdbe67d5557eb2
size: 2793734157
cluster-ratings-5core:
cmd: cargo run --release -- kcore -o gr-cluster-ratings-5core.parquet gr-cluster-ratings.parquet
deps:
- path: ../src/cli/kcore.rs
hash: md5
md5: 9a64f2beb19d2053d9c2386609beafe9
size: 4874
- path: gr-cluster-ratings.parquet
hash: md5
md5: 0afc2f46e59c3f22f203c3c0759653c2
size: 1369545709
outs:
- path: gr-cluster-ratings-5core.parquet
hash: md5
md5: 3ae528be03c73d8974d4f0c51f3b698a
size: 1347470376
cluster-actions-5core:
cmd: cargo run --release -- kcore -o gr-cluster-actions-5core.parquet gr-cluster-actions.parquet
deps:
- path: ../src/cli/kcore.rs
hash: md5
md5: 9a64f2beb19d2053d9c2386609beafe9
size: 4874
- path: gr-cluster-actions.parquet
hash: md5
md5: 6f3181c11589740851b49364f7140793
size: 2709707272
outs:
- path: gr-cluster-actions-5core.parquet
hash: md5
md5: b407bad4df6930b6630427d138543020
size: 2694955704
work-ratings-5core:
cmd: cargo run --release -- kcore -o gr-work-ratings-5core.parquet gr-work-ratings.parquet
deps:
- path: ../src/cli/kcore.rs
hash: md5
md5: 9a64f2beb19d2053d9c2386609beafe9
size: 4874
- path: gr-work-ratings.parquet
hash: md5
md5: f4d72ec57c57ac3d013b7271c79d950b
size: 1434246294
outs:
- path: gr-work-ratings-5core.parquet
hash: md5
md5: e3433e12926d01b7c49f4f32fb603b83
size: 1412366085
work-actions-2015-100-10core:
cmd: cargo run --release -- kcore --user-k 10 --item-k 100 --year 2015 -o gr-work-actions-2015-100-10core.parquet
gr-work-actions.parquet
deps:
- path: ../src/cli/kcore.rs
hash: md5
md5: 9a64f2beb19d2053d9c2386609beafe9
size: 4874
- path: gr-work-actions.parquet
hash: md5
md5: d20518227c34e9b440ba5f39e85da954
size: 2808086719
outs:
- path: gr-work-actions-2015-100-10core.parquet
hash: md5
md5: aa0c49abdfb8887fb7361b0cbff1ce34
size: 177294387
work-ratings-2015-100-10core:
cmd: cargo run --release -- kcore --user-k 10 --item-k 100 --year 2015 -o gr-work-ratings-2015-100-10core.parquet
gr-work-ratings.parquet
deps:
- path: ../src/cli/kcore.rs
hash: md5
md5: 9a64f2beb19d2053d9c2386609beafe9
size: 4874
- path: gr-work-ratings.parquet
hash: md5
md5: f4d72ec57c57ac3d013b7271c79d950b
size: 1434246294
outs:
- path: gr-work-ratings-2015-100-10core.parquet
hash: md5
md5: b4d5bb0ae60646e8a1dd04cf57c47ab1
size: 5153391
scan-reviews:
cmd: cargo run --release -- goodreads scan reviews ../data/goodreads/goodreads_reviews_dedup.json.gz
deps:
- path: ../data/goodreads/goodreads_reviews_dedup.json.gz
hash: md5
md5: bdd95c4f92691df3d311012254988a1e
size: 5343299228
- path: ../src/cli/goodreads
hash: md5
md5: 02847887f789f5b5f5672f8768f95b05.dir
size: 10485
nfiles: 4
- path: ../src/goodreads
hash: md5
md5: e8117b3c433c7fa4f7ffbaf7cb8d7f06.dir
size: 19592
nfiles: 9
- path: gr-book-link.parquet
hash: md5
md5: 8e85c40384efe8936830b3d92dc59b9a
size: 19973611
- path: gr-users.parquet
hash: md5
md5: ceb6314193b9a242df2a4db02e39f639
size: 18637038
outs:
- path: gr-reviews.parquet
hash: md5
md5: c4fc1913cb2b332f1709f737086346a1
size: 4513330717
Tip!
Press p or to see the previous file or,
n or to see the next file