Browse Source

cluster information

Michael Ekstrand 8 months ago
parent
commit
7a1b3fc870

+ 1211
- 0
ClusterStats.ipynb

+ 45
- 45
LinkageStats.ipynb

+ 4
- 2
Dvcfile

@@ -4,5 +4,7 @@ deps:
 - path: pgstat://author-info
   md5: fbe8221bb82c3ec262c1297bb5487f80
 - path: pgstat://author-stats
-  md5: 22e072c4b87d34de63a8d90ee66000dc
-md5: 92c6062e6c60af6a55b3231809d18d72
+  md5: fd4d4fb454a0c04cf4135183f0154691
+- path: pgstat://cluster-stats
+  md5: e04cd89b4d82b2c18d1758079631f6cd
+md5: be6c08fbf8d70ecbebc56d4d66fa53fd

+ 2
- 1
environment.yml

@@ -18,7 +18,8 @@ dependencies:
 - sqlalchemy
 - sqlparse
 - rust>=1.40
-- postgresql>=10
+- postgresql>=12
+- notebook
 - requests
 - html5lib
 - gitpython

+ 2
- 0
index/.gitignore

@@ -6,3 +6,5 @@
 /gr-index-books.transcript
 /az-index.transcript
 /bx-index.transcript
+/loc-mds-book-info.transcript
+/ol-book-info.transcript

+ 2
- 2
index/gr-index-books.dvc

@@ -1,8 +1,8 @@
-md5: cc56d0ce3b560c7ead20ffa989ba7187
+md5: eb1aab0498209f394a0f5f0bceffd647
 cmd: python ../run.py sql-script gr-index-books.sql
 deps:
 - path: gr-index-books.sql
-  md5: be7ae4253decf01fa8b9f0ee772d1fb0
+  md5: e935200eeaa893d2112427c06e89ca6e
 - path: pgstat://gr-books
   md5: f5ed1a405492061fa4064874f7ad9b2e
 - path: pgstat://gr-works

+ 8
- 0
index/gr-index-books.sql

@@ -94,3 +94,11 @@ CREATE TABLE IF NOT EXISTS gr.book_genres
      WHERE gr_book_id = (gr_book_genres_data->>'book_id')::int;
 CREATE INDEX bg_book_rid ON gr.book_genres (gr_book_rid);
 CREATE INDEX bg_book_id ON gr.book_genres (gr_book_id);
+
+--- #step Extract GoodReads work titles
+DROP MATERIALIZED VIEW IF EXISTS gr.work_titles;
+CREATE MATERIALIZED VIEW gr.work_titles
+AS SELECT gr_work_rid, (gr_work_data->>'work_id')::int AS gr_work_id,
+  NULLIF(gr_work_data->>'original_title', '') AS work_title
+FROM gr.raw_work;
+CREATE INDEX gr_work_title_work_idx ON gr.work_titles (gr_work_id);

+ 18
- 0
index/loc-mds-book-info.dvc

@@ -0,0 +1,18 @@
+md5: 40c5f68ff5c321936acf34e518bcca8f
+cmd: python ../run.py sql-script loc-mds-book-info.sql
+deps:
+- path: loc-mds-book-info.sql
+  md5: fce49f005e1450890452881a6e71ff40
+- path: pgstat://loc-mds-index-books
+  md5: d638b9a4a77a99749f2214e6400d85a1
+outs:
+- path: pgstat://loc-mds-book-info
+  cache: false
+  md5: be165ab9a7023147ad3f91248839562f
+  metric: false
+  persist: false
+- path: loc-mds-book-info.transcript
+  cache: true
+  metric: false
+  persist: false
+  md5: 6483fcb2bbe15c64e93c1f002a997829

+ 11
- 0
index/loc-mds-book-info.sql

@@ -0,0 +1,11 @@
+--- #dep loc-mds-index-books
+-- Extract more book information
+
+--- #step Extract book titles
+DROP MATERIALIZED VIEW IF EXISTS locmds.book_title;
+CREATE MATERIALIZED VIEW locmds.book_title
+AS SELECT rec_id, contents AS title
+    FROM locmds.book_marc_field
+    WHERE tag = '245' AND sf_code = 'a';
+CREATE INDEX locmds_book_title_rec_ids ON locmds.book_title (rec_id);
+ANALYZE locmds.book_title;

+ 18
- 0
index/ol-book-info.dvc

@@ -0,0 +1,18 @@
+md5: 3d96ddc24b4ba2bf9e9d4c4a9211fff4
+cmd: python ../run.py sql-script ol-book-info.sql
+deps:
+- md5: 3ef46b2a3e878a00e3a5ad7a31aa5a1f
+  path: ol-book-info.sql
+- path: pgstat://ol-index
+  md5: cc0ea4654a7f8396c7b9fa4b8657e1d8
+outs:
+- path: pgstat://ol-book-info
+  cache: false
+  md5: 53d97da4e3f66b18f6d0f0e607a3d964
+  metric: false
+  persist: false
+- md5: 1b6f0f44da8a9595cefbca675086bf71
+  path: ol-book-info.transcript
+  cache: true
+  metric: false
+  persist: false

+ 16
- 0
index/ol-book-info.sql

@@ -0,0 +1,16 @@
+--- #dep ol-index
+-- Extract book information from OpenLibrary
+
+--- #step Extract edition titles
+CREATE MATERIALIZED VIEW IF NOT EXISTS ol.edition_title
+AS SELECT edition_id, edition_data->>'title' AS title
+    FROM ol.edition;
+CREATE INDEX IF NOT EXISTS ol_edition_title_idx ON ol.edition_title (edition_id);
+ANALYZE ol.edition_title;
+
+--- #step Extract work titles
+CREATE MATERIALIZED VIEW IF NOT EXISTS ol.work_title
+AS SELECT work_id, work_data->>'title' AS title
+    FROM ol.work;
+CREATE INDEX IF NOT EXISTS ol_work_title_idx ON ol.work_title (work_id);
+ANALYZE ol.work_title;

+ 1
- 0
integrate/.gitignore

@@ -4,3 +4,4 @@
 /gr-cluster.transcript
 /author-info.transcript
 /author-stats.transcript
+/cluster-stats.transcript

+ 5
- 3
integrate/author-stats.dvc

@@ -1,16 +1,18 @@
-md5: b1056dbc1afa4690b664f166601d5287
+md5: 12bc4a64eca8fe8d87e6135d1b51137a
 cmd: python ../run.py sql-script author-stats.sql
 deps:
+- path: author-stats.sql
+  md5: 45c6cbd78f4eb7c00f56b5568c992934
 - path: pgstat://author-info
   md5: fbe8221bb82c3ec262c1297bb5487f80
 outs:
 - path: pgstat://author-stats
   cache: false
-  md5: 22e072c4b87d34de63a8d90ee66000dc
+  md5: fd4d4fb454a0c04cf4135183f0154691
   metric: false
   persist: false
 - path: author-stats.transcript
   cache: true
   metric: false
   persist: false
-  md5: 49234a450464aa4cfab7d63a71ce805d
+  md5: d51bfb1f4b140edadee709dbdb53425b

+ 1
- 0
integrate/author-stats.sql

@@ -1,4 +1,5 @@
 --- #dep author-info
+--- #table integration_stats
 --- #step Set up statistics table
 DROP TABLE IF EXISTS integration_stats CASCADE;
 CREATE TABLE integration_stats (

+ 18
- 0
integrate/cluster-stats.dvc

@@ -0,0 +1,18 @@
+md5: 65d7c0abd815848714ed9b76de08e30e
+cmd: python ../run.py sql-script cluster-stats.sql
+deps:
+- path: cluster-stats.sql
+  md5: df0c5283146c5989f9fee7ba5b9dfa4d
+- path: pgstat://cluster
+  md5: 4114bb8850b39411d8a700fa490ca3f1
+outs:
+- path: pgstat://cluster-stats
+  cache: false
+  md5: e04cd89b4d82b2c18d1758079631f6cd
+  metric: false
+  persist: false
+- path: cluster-stats.transcript
+  cache: true
+  metric: false
+  persist: false
+  md5: 9519ca91b3eb0b42c920cf36dccfa943

+ 51
- 0
integrate/cluster-stats.sql

@@ -0,0 +1,51 @@
+--- #dep cluster
+--- #table gr.cluster_stats
+--- #table locmds.cluster_stats
+--- #table ol.cluster_stats
+--- #table cluster_stats
+--- #step Count GoodReads cluster statistics
+DROP MATERIALIZED VIEW IF EXISTS gr.cluster_stats CASCADE;
+CREATE MATERIALIZED VIEW gr.cluster_stats AS
+SELECT cluster,
+    COUNT(DISTINCT gr_book_id) AS gr_books,
+    COUNT(DISTINCT gr_work_id) AS gr_works
+FROM gr.book_cluster
+JOIN gr.book_ids USING (gr_book_id)
+GROUP BY cluster;
+CREATE UNIQUE INDEX gr_cluster_stat_cluster_idx ON gr.cluster_stats (cluster);
+ANALYZE gr.cluster_stats;
+
+--- #step Count LOC-MDS cluster statistics
+DROP MATERIALIZED VIEW IF EXISTS locmds.cluster_stats CASCADE;
+CREATE MATERIALIZED VIEW locmds.cluster_stats AS
+SELECT cluster, COUNT(DISTINCT rec_id) AS loc_recs
+FROM isbn_cluster
+JOIN locmds.book_rec_isbn USING (isbn_id)
+GROUP BY cluster;
+CREATE UNIQUE INDEX loc_cluster_stat_cluster_idx ON locmds.cluster_stats(cluster);
+ANALYZE locmds.cluster_stats;
+
+--- #step Count OpenLib cluster statistics
+DROP MATERIALIZED VIEW IF EXISTS ol.cluster_stats CASCADE;
+CREATE MATERIALIZED VIEW ol.cluster_stats AS
+SELECT cluster,
+    COUNT(DISTINCT edition_id) AS ol_editions,
+    COUNT(DISTINCT work_id) AS ol_works
+FROM isbn_cluster
+JOIN ol.isbn_link USING (isbn_id)
+GROUP BY cluster;
+CREATE UNIQUE INDEX ol_cluster_stat_cluster_idx ON ol.cluster_stats(cluster);
+ANALYZE ol.cluster_stats;
+
+--- #step Create joint statistics table
+DROP MATERIALIZED VIEW IF EXISTS cluster_stats CASCADE;
+CREATE MATERIALIZED VIEW cluster_stats AS
+WITH isbn_stats AS (SELECT cluster, COUNT(isbn_id) AS isbns
+                    FROM isbn_cluster
+                    GROUP BY cluster)
+SELECT cluster, isbns, loc_recs, ol_editions, ol_works, gr_books, gr_works
+FROM isbn_stats
+LEFT JOIN locmds.cluster_stats USING (cluster)
+LEFT JOIN gr.cluster_stats USING (cluster)
+LEFT JOIN ol.cluster_stats USING (cluster);
+CREATE UNIQUE INDEX cluster_stat_cluster_idx ON cluster_stats (cluster);

+ 70
- 0
scripts/inspect-json.py

@@ -0,0 +1,70 @@
+"""
+Usage:
+    inspect-json.py --gr-work [ID...]
+    inspect-json.py --gr-book [ID...]
+"""
+
+import sys
+import re
+import json
+from docopt import docopt
+from bookdata import tracking, db, script_log
+
+class GR:
+    """
+    Callable that fetches raw GoodReads JSON records of one type.
+
+    The type name (e.g. ``'work'`` or ``'book'``) is interpolated into the
+    table and column names of the queries (``gr.raw_{type}``,
+    ``gr.{type}_ids``, ``gr_{type}_data``, ...).
+    """
+
+    def __init__(self, type):
+        # Record type name used to build table/column names in the queries.
+        self.type = type
+
+    def __call__(self, ids):
+        """
+        Fetch records for the given IDs, or a single record if none are given.
+
+        Returns a list: one entry per requested ID (an entry is ``None`` when
+        that ID was not found), or a one-element list holding whatever row
+        the database returns first when ``ids`` is empty or ``None``.
+        """
+        with db.connect() as dbc, dbc.cursor() as cur:
+            if ids:
+                return [self._id_rec(cur, r) for r in ids]
+            else:
+                return [self._top_rec(cur)]
+
+    def _id_rec(self, cur, id):
+        """
+        Look up the record whose ``gr_{type}_id`` equals ``id``.
+
+        Returns the first column of the matching row; if no row matches, logs
+        an error and implicitly returns ``None``.
+        """
+        t = self.type
+        _log.info('fetching %s %s', t, id)
+        # Only the type name is formatted into the SQL text; the ID itself is
+        # passed as a query parameter.
+        q = f'''
+            SELECT gr_{t}_data
+            FROM gr.raw_{t} JOIN gr.{t}_ids USING (gr_{t}_rid)
+            WHERE gr_{t}_id = %s
+        '''
+        cur.execute(q, [id])
+        rec = cur.fetchone()
+        if rec is None:
+            _log.error('%s %s not found', t, id)
+        else:
+            return rec[0]
+
+    def _top_rec(self, cur):
+        """
+        Fetch a single record of this type, with no ordering specified.
+
+        NOTE(review): unpacking the result of ``fetchone()`` presumably fails
+        if the table is empty - confirm against the DB driver.
+        """
+        t = self.type
+        _log.info('fetching one %s', t)
+        q = f'SELECT gr_{t}_data FROM gr.raw_{t} LIMIT 1'
+        cur.execute(q)
+        data, = cur.fetchone()
+        _log.debug('got %r', data)
+        return data
+
+
+__gr_work = GR('work')
+__gr_book = GR('book')
+
+_log = script_log(__name__)
+opts = docopt(__doc__)
+
+rec_ids = opts.get('ID', None)
+if rec_ids:
+    rec_ids = [int(r) for r in rec_ids]
+
+recs = None
+for k in opts.keys():
+    fn = k.replace('-', '_')
+    if k.startswith('--') and opts[k] and fn in globals():
+        f = globals()[fn]
+        recs = f(rec_ids)
+
+if recs is None:
+    _log.error('could not find an operation to perform')
+
+for rec in recs:
+    print(json.dumps(rec, indent=2))