Browse Source

working exports

Michael Ekstrand 2 months ago
parent
commit
8da85ee8ff
7 changed files with 190 additions and 34 deletions
  1. 1
    0
      environment.yml
  2. 7
    0
      export/.gitignore
  3. 71
    10
      export/dvc.lock
  4. 31
    0
      export/dvc.yaml
  5. 2
    2
      index/dvc.lock
  6. 9
    5
      index/gr-book-authors.sql
  7. 69
    17
      scripts/export-goodreads.py

+ 1
- 0
environment.yml

@@ -9,6 +9,7 @@ dependencies:
 - more-itertools
 - psycopg2
 - pandas
+- pyarrow
 - numpy
 - tqdm
 - colorama

+ 7
- 0
export/.gitignore

@@ -1,3 +1,10 @@
 /gr-book-ids.csv.gz
 /gr-book-ids.parquet
 /gr-work-ratings.parquet
+/gr-work-titles.parquet
+/gr-work-actions.parquet
+/gr-work-titles.csv.gz
+/gr-work-authors.parquet
+/gr-work-authors.csv.gz
+/gr-work-genres.parquet
+/gr-work-genres.csv.gz

+ 71
- 10
export/dvc.lock

@@ -7,15 +7,15 @@ stages:
       md5: 3fe0d48fd17efd5e670f6b2649fc2286
       size: 357
     - path: ../scripts/export-goodreads.py
-      md5: 057af6ab6c6f122db0c351f1b5fe77c8
-      size: 3438
+      md5: 40346af4504f271031f62b711d2af9e7
+      size: 4992
     outs:
     - path: gr-book-ids.csv.gz
-      md5: 84f072c02f669ae9aca3f754957d787f
-      size: 23739581
+      md5: 98cafc7e2f0ce3453928dbfd067be815
+      size: 23739591
     - path: gr-book-ids.parquet
-      md5: 5f0c7ea28a747530ed19543d4d58c4b5
-      size: 22506875
+      md5: aa8cb68d664c2ff5a5438194a0d5db60
+      size: 33901322
   gr-work-ratings:
     cmd: python ../run.py export-goodreads --work-ratings
     deps:
@@ -23,9 +23,70 @@ stages:
       md5: 2e5bab21dcd1a3ad70c7b9aa33e1f929
       size: 393
     - path: ../scripts/export-goodreads.py
-      md5: 057af6ab6c6f122db0c351f1b5fe77c8
-      size: 3438
+      md5: 40346af4504f271031f62b711d2af9e7
+      size: 4992
     outs:
     - path: gr-work-ratings.parquet
-      md5: f0705ef831eae32ee7b616bcb4817c11
-      size: 701939313
+      md5: 9d661dd551007db12d9b78392546a58c
+      size: 694490180
+  gr-work-titles:
+    cmd: python ../run.py export-goodreads --work-titles
+    deps:
+    - path: ../index/gr-book-info.status
+      md5: b6d909057e78a11672d579e61c18df0a
+      size: 553
+    - path: ../scripts/export-goodreads.py
+      md5: 40346af4504f271031f62b711d2af9e7
+      size: 4992
+    outs:
+    - path: gr-work-titles.csv.gz
+      md5: 18e4edc24dc1e690926ead2c3fc995fd
+      size: 23427453
+    - path: gr-work-titles.parquet
+      md5: 3309955fa43ba25559055e4080a696d4
+      size: 34273011
+  gr-work-actions:
+    cmd: python ../run.py export-goodreads --work-actions
+    deps:
+    - path: ../index/gr-index-ratings.status
+      md5: 2e5bab21dcd1a3ad70c7b9aa33e1f929
+      size: 393
+    - path: ../scripts/export-goodreads.py
+      md5: 40346af4504f271031f62b711d2af9e7
+      size: 4992
+    outs:
+    - path: gr-work-actions.parquet
+      md5: 5a36d44364960b141333c764f1e65f13
+      size: 1531608180
+  gr-work-authors:
+    cmd: python ../run.py export-goodreads --work-authors
+    deps:
+    - path: ../index/gr-book-info.status
+      md5: b6d909057e78a11672d579e61c18df0a
+      size: 553
+    - path: ../scripts/export-goodreads.py
+      md5: 40346af4504f271031f62b711d2af9e7
+      size: 4992
+    outs:
+    - path: gr-work-authors.csv.gz
+      md5: cfcf231da4dc10264e4a61cf68c7438e
+      size: 24822806
+    - path: gr-work-authors.parquet
+      md5: 13ca61d99fdc6552a6ef68dfaed1e168
+      size: 19153803
+  gr-work-genres:
+    cmd: python ../run.py export-goodreads --work-genres
+    deps:
+    - path: ../index/gr-book-info.status
+      md5: b6d909057e78a11672d579e61c18df0a
+      size: 553
+    - path: ../scripts/export-goodreads.py
+      md5: 40346af4504f271031f62b711d2af9e7
+      size: 4992
+    outs:
+    - path: gr-work-genres.csv.gz
+      md5: 755b1c2277180b27cdcc43a8bb6f799d
+      size: 10831285
+    - path: gr-work-genres.parquet
+      md5: 698d4b26a866baddf372cf9df6b11d92
+      size: 5629455

+ 31
- 0
export/dvc.yaml

@@ -14,3 +14,34 @@ stages:
     - ../scripts/export-goodreads.py
     outs:
     - gr-work-ratings.parquet
+  gr-work-actions:
+    cmd: python ../run.py export-goodreads --work-actions
+    deps:
+    - ../index/gr-index-ratings.status
+    - ../scripts/export-goodreads.py
+    outs:
+    - gr-work-actions.parquet
+  gr-work-titles:
+    cmd: python ../run.py export-goodreads --work-titles
+    deps:
+    - ../index/gr-book-info.status
+    - ../scripts/export-goodreads.py
+    outs:
+    - gr-work-titles.parquet
+    - gr-work-titles.csv.gz
+  gr-work-authors:
+    cmd: python ../run.py export-goodreads --work-authors
+    deps:
+    - ../index/gr-book-info.status
+    - ../scripts/export-goodreads.py
+    outs:
+    - gr-work-authors.parquet
+    - gr-work-authors.csv.gz
+  gr-work-genres:
+    cmd: python ../run.py export-goodreads --work-genres
+    deps:
+    - ../index/gr-book-info.status
+    - ../scripts/export-goodreads.py
+    outs:
+    - gr-work-genres.parquet
+    - gr-work-genres.csv.gz

+ 2
- 2
index/dvc.lock

@@ -46,8 +46,8 @@ stages:
       md5: abac8ffbe1d4b0e33b39320bdfd7974d
       size: 51
     - path: gr-book-authors.sql
-      md5: f007063c0371e5881b2e2663668b2c0c
-      size: 1305
+      md5: e8d7c2ce3d2cc997506e1302400df8db
+      size: 1436
     - path: gr-index-books.status
       md5: 3fe0d48fd17efd5e670f6b2649fc2286
       size: 357

+ 9
- 5
index/gr-book-authors.sql

@@ -2,7 +2,7 @@
 --- #dep gr-works
 --- #dep gr-authors
 --- #table gr.book_first_author
---- #table gr.author_ids
+--- #table gr.author_info
 
 --- #step Extract book first authors
 CREATE MATERIALIZED VIEW IF NOT EXISTS gr.book_first_author AS
@@ -11,6 +11,7 @@ FROM gr.raw_book, jsonb_to_record(gr_book_data->'authors'->0) AS
     x(role VARCHAR, author_id INTEGER);
 CREATE INDEX IF NOT EXISTS gr_bfa_book_idx ON gr.book_first_author (gr_book_rid);
 CREATE INDEX IF NOT EXISTS gr_bfa_auth_idx ON gr.book_first_author (gr_author_id);
+ANALYZE gr.book_first_author;
 
 --- #step Extract book authors
 CREATE MATERIALIZED VIEW IF NOT EXISTS gr.book_authors AS
@@ -19,10 +20,13 @@ FROM gr.raw_book, jsonb_to_recordset(gr_book_data->'authors') AS
     x(role VARCHAR, author_id INTEGER);
 CREATE INDEX IF NOT EXISTS gr_ba_book_idx ON gr.book_authors (gr_book_rid);
 CREATE INDEX IF NOT EXISTS gr_ba_auth_idx ON gr.book_authors (gr_author_id);
+ANALYZE gr.book_authors;
 
 --- #step Extract author IDs
-CREATE TABLE IF NOT EXISTS gr.author_ids
-  AS SELECT gr_author_rid, (gr_author_data->>'author_id')::int AS gr_author_id
+CREATE TABLE IF NOT EXISTS gr.author_info
+  AS SELECT gr_author_rid, (gr_author_data->>'author_id')::int AS gr_author_id,
+        gr_author_data->>'name' AS author_name
      FROM gr.raw_author;
-CREATE UNIQUE INDEX IF NOT EXISTS gr_author_ridx ON gr.author_ids (gr_author_rid);
-CREATE UNIQUE INDEX IF NOT EXISTS gr_author_idx ON gr.author_ids (gr_author_id);
+CREATE UNIQUE INDEX IF NOT EXISTS gr_author_ridx ON gr.author_info (gr_author_rid);
+CREATE UNIQUE INDEX IF NOT EXISTS gr_author_idx ON gr.author_info (gr_author_id);
+ANALYZE gr.author_info;

+ 69
- 17
scripts/export-goodreads.py

@@ -4,12 +4,17 @@ Export GoodReads-specific data from the book data tools.
 Usage:
     export.py --book-ids
     export.py --work-titles
-    export.py --work-ratings [--implicit]
+    export.py --work-authors
+    export.py --work-genres
+    export.py --work-ratings
+    export.py --work-actions
 """
 
 from pathlib import Path
 from docopt import docopt
 import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 
 from bookdata import script_log
 from bookdata import db
@@ -19,7 +24,7 @@ _log = script_log('export-goodreads')
 
 def export_book_ids():
     query = '''
-        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster AS book_id
+        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
         FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
         ORDER BY gr_book_rid
     '''
@@ -33,12 +38,12 @@ def export_book_ids():
     _log.info('writing CSV to %s', csv_fn)
     books.to_csv(csv_fn, index=False)
     _log.info('writing parquet to %s', pq_fn)
-    books.to_parquet(pq_fn, index=False, compression='gzip')
+    books.to_parquet(pq_fn, index=False)
 
 
 def export_work_titles():
     query = f'''
-        SELECT gr_work_rid, gr_work_id, work_title
+        SELECT gr_work_id AS work_id, gr_work_rid, work_title
         FROM gr.work_title
         ORDER BY gr_work_rid
     '''
@@ -49,18 +54,61 @@ def export_work_titles():
 
     pq_fn = 'gr-work-titles.parquet'
     _log.info('writing parquet to %s', pq_fn)
+    books.to_parquet(pq_fn, index=False)
+    _log.info('writing CSV')
+    books.to_csv('gr-work-titles.csv.gz', index=False)
+
+
+def export_work_genres():
+    query = f'''
+        SELECT gr_work_id AS work_id, genre, sum(score::int) AS score
+        FROM gr.book_ids JOIN gr.book_genres USING (gr_book_rid)
+        GROUP BY work_id, genre
+        ORDER BY work_id, genre
+    '''
+
+    with db.connect() as dbc:
+        _log.info('reading work genres')
+        genres = db.load_table(dbc, query)
+
+    pq_fn = 'gr-work-genres.parquet'
+    _log.info('writing parquet to %s', pq_fn)
+    genres.to_parquet(pq_fn, index=False, compression='brotli')
+    _log.info('writing CSV')
+    genres.to_csv('gr-work-genres.csv.gz', index=False)
+
+
+def export_work_authors():
+    query = f'''
+        WITH
+            pairs AS (SELECT DISTINCT gr_work_id AS work_id, gr_author_id
+                      FROM gr.book_ids JOIN gr.book_authors USING (gr_book_rid)
+                      WHERE author_role = '' AND gr_work_id IS NOT NULL)
+        SELECT work_id, gr_author_id AS author_id, author_name
+        FROM pairs JOIN gr.author_info USING (gr_author_id)
+        ORDER BY work_id
+    '''
+
+    with db.connect() as dbc:
+        _log.info('reading work authors')
+        books = db.load_table(dbc, query)
+
+    pq_fn = 'gr-work-authors.parquet'
+    _log.info('writing parquet to %s', pq_fn)
     books.to_parquet(pq_fn, index=False, compression='brotli')
+    _log.info('writing CSV')
+    books.to_csv('gr-work-authors.csv.gz', index=False)
 
 
 def export_work_actions():
     query = '''
-    SELECT gr_user_rid AS user,
-            COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS item,
+    SELECT gr_user_rid AS user, gr_work_id AS item,
             COUNT(rating) AS nactions,
             MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
             MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
      FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
-     GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
+     WHERE gr_work_id IS NOT NULL
+     GROUP BY gr_user_rid, gr_work_id
      ORDER BY MIN(date_updated)
     '''
 
@@ -73,20 +121,20 @@ def export_work_actions():
         })
 
     _log.info('writing actions')
-    actions.to_parquet('gr-work-actions.parquet', index=False, compression='brotli')
+    actions.to_parquet('gr-work-actions.parquet', index=False,
+                       compression='zstd', compression_level=5)
 
 
 def export_work_ratings():
     query = '''
-    SELECT gr_user_rid AS user,
-            COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS item,
+    SELECT gr_user_rid AS user, gr_work_id AS item,
             MEDIAN(rating) AS rating,
             (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
             MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
             COUNT(rating) AS nratings
      FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
-     WHERE rating > 0
-     GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
+     WHERE rating > 0 AND gr_work_id IS NOT NULL
+     GROUP BY gr_user_rid, gr_work_id
      ORDER BY MIN(date_updated)
     '''
 
@@ -101,7 +149,8 @@ def export_work_ratings():
         })
 
     _log.info('writing ratings')
-    ratings.to_parquet('gr-work-ratings.parquet', index=False, compression='brotli')
+    ratings.to_parquet('gr-work-ratings.parquet', index=False,
+                       compression='zstd', compression_level=5)
 
 
 args = docopt(__doc__)
@@ -110,8 +159,11 @@ if args['--book-ids']:
     export_book_ids()
 if args['--work-titles']:
     export_work_titles()
+if args['--work-authors']:
+    export_work_authors()
+if args['--work-genres']:
+    export_work_genres()
 if args['--work-ratings']:
-    if args['--implicit']:
-        export_work_actions()
-    else:
-        export_work_ratings()
+    export_work_ratings()
+if args['--work-actions']:
+    export_work_actions()