Browse Source

Export titles

Michael Ekstrand 2 months ago
parent
commit
6bc65ecadd
5 changed files with 75 additions and 35 deletions
  1. bookdata/db.py (+2, -2)
  2. export/.gitignore (+1, -0)
  3. export/dvc.lock (+15, -2)
  4. export/dvc.yaml (+7, -0)
  5. scripts/export-goodreads.py (+50, -31)

+ 2
- 2
bookdata/db.py

@@ -435,7 +435,7 @@ class _LoadThread(threading.Thread):
             cur.copy_expert(self.query, self.chan)
 
 
-def load_table(dbc, query):
+def load_table(dbc, query, **kwargs):
     """
     Load a query into a Pandas data frame.
 
@@ -446,7 +446,7 @@ def load_table(dbc, query):
     q = sql.SQL(query)
     thread = _LoadThread(dbc, cq.format(q))
     thread.start()
-    data = pd.read_csv(thread.reader)
+    data = pd.read_csv(thread.reader, **kwargs)
     thread.join()
     return data
 

+ 1
- 0
export/.gitignore

@@ -1,2 +1,3 @@
 /gr-book-ids.csv.gz
 /gr-book-ids.parquet
+/gr-work-ratings.parquet

+ 15
- 2
export/dvc.lock

@@ -7,8 +7,8 @@ stages:
       md5: 3fe0d48fd17efd5e670f6b2649fc2286
       size: 357
     - path: ../scripts/export-goodreads.py
-      md5: 1819545419b65dc4009b2dfe68d02d68
-      size: 2930
+      md5: 057af6ab6c6f122db0c351f1b5fe77c8
+      size: 3438
     outs:
     - path: gr-book-ids.csv.gz
       md5: 84f072c02f669ae9aca3f754957d787f
@@ -16,3 +16,16 @@ stages:
     - path: gr-book-ids.parquet
       md5: 5f0c7ea28a747530ed19543d4d58c4b5
       size: 22506875
+  gr-work-ratings:
+    cmd: python ../run.py export-goodreads --work-ratings
+    deps:
+    - path: ../index/gr-index-ratings.status
+      md5: 2e5bab21dcd1a3ad70c7b9aa33e1f929
+      size: 393
+    - path: ../scripts/export-goodreads.py
+      md5: 057af6ab6c6f122db0c351f1b5fe77c8
+      size: 3438
+    outs:
+    - path: gr-work-ratings.parquet
+      md5: f0705ef831eae32ee7b616bcb4817c11
+      size: 701939313

+ 7
- 0
export/dvc.yaml

@@ -7,3 +7,10 @@ stages:
     outs:
     - gr-book-ids.csv.gz
     - gr-book-ids.parquet
+  gr-work-ratings:
+    cmd: python ../run.py export-goodreads --work-ratings
+    deps:
+    - ../index/gr-index-ratings.status
+    - ../scripts/export-goodreads.py
+    outs:
+    - gr-work-ratings.parquet

+ 50
- 31
scripts/export-goodreads.py

@@ -3,6 +3,7 @@ Export GoodReads-specific data from the book data tools.
 
 Usage:
     export.py --book-ids
+    export.py --work-titles
     export.py --work-ratings [--implicit]
 """
 
@@ -13,7 +14,7 @@ import pandas as pd
 from bookdata import script_log
 from bookdata import db
 
-_log = script_log(__file__)
+_log = script_log('export-goodreads')
 
 
 def export_book_ids():
@@ -22,8 +23,9 @@ def export_book_ids():
         FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
         ORDER BY gr_book_rid
     '''
-    _log.info('reading book IDs')
+
     with db.connect() as dbc:
+        _log.info('reading book IDs')
         books = db.load_table(dbc, query)
 
     csv_fn = 'gr-book-ids.csv.gz'
@@ -34,12 +36,26 @@ def export_book_ids():
     books.to_parquet(pq_fn, index=False, compression='gzip')
 
 
-def export_work_actions():
-    path = data_dir / 'GR-I' / 'work-ratings.parquet'
-
+def export_work_titles():
     query = f'''
-    SELECT gr_user_rid AS user_id,
-            COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS book_id,
+        SELECT gr_work_rid, gr_work_id, work_title
+        FROM gr.work_title
+        ORDER BY gr_work_rid
+    '''
+
+    with db.connect() as dbc:
+        _log.info('reading work titles')
+        books = db.load_table(dbc, query)
+
+    pq_fn = 'gr-work-titles.parquet'
+    _log.info('writing parquet to %s', pq_fn)
+    books.to_parquet(pq_fn, index=False, compression='brotli')
+
+
+def export_work_actions():
+    query = '''
+    SELECT gr_user_rid AS user,
+            COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS item,
             COUNT(rating) AS nactions,
             MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
             MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
@@ -47,24 +63,23 @@ def export_work_actions():
      GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
      ORDER BY MIN(date_updated)
     '''
-    _log.info('reading book shelf actions')
-    actions = dt.load_table(query, dtype={
-        'user': 'i4',
-        'item': 'i4',
-        'nactions': 'i4'
-    })
 
-    path.parent.mkdir(parents=True, exist_ok=True)
-    _log.info('writing ratings to %s', path)
-    actions.to_parquet(path, index=False)
+    with db.connect() as dbc:
+        _log.info('reading book shelf actions')
+        actions = db.load_table(dbc, query, dtype={
+            'user': 'i4',
+            'item': 'i4',
+            'nactions': 'i4'
+        })
 
+    _log.info('writing actions')
+    actions.to_parquet('gr-work-actions.parquet', index=False, compression='brotli')
 
-def export_work_ratings():
-    path = data_dir / 'GR-E' / 'work-ratings.parquet'
 
-    query = f'''
-    SELECT gr_user_rid AS user_id,
-            COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS book_id,
+def export_work_ratings():
+    query = '''
+    SELECT gr_user_rid AS user,
+            COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS item,
             MEDIAN(rating) AS rating,
             (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
             MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
@@ -74,23 +89,27 @@ def export_work_ratings():
      GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
      ORDER BY MIN(date_updated)
     '''
-    _log.info('reading book ratings')
-    ratings = dt.load_table(query, dtype={
-        'user': 'i4',
-        'item': 'i4',
-        'rating': 'f4',
-        'nactions': 'i4'
-    })
 
-    path.parent.mkdir(parents=True, exist_ok=True)
-    _log.info('writing ratings to %s', path)
-    ratings.to_parquet(path, index=False)
+    with db.connect() as dbc:
+        _log.info('reading book ratings')
+        ratings = db.load_table(dbc, query, dtype={
+            'user': 'i4',
+            'item': 'i4',
+            'rating': 'f4',
+            'last_rating': 'f4',
+            'nratings': 'i4'
+        })
+
+    _log.info('writing ratings')
+    ratings.to_parquet('gr-work-ratings.parquet', index=False, compression='brotli')
 
 
 args = docopt(__doc__)
 
 if args['--book-ids']:
     export_book_ids()
+if args['--work-titles']:
+    export_work_titles()
 if args['--work-ratings']:
     if args['--implicit']:
         export_work_actions()