Browse Source

simplify + goodreads import

Michael Ekstrand 2 years ago
parent
commit
baaae933e6
10 changed files with 72 additions and 49 deletions
  1. 3
    13
      az-index.sql
  2. 2
    2
      az-schema.sql
  3. 2
    28
      bx-index.sql
  4. 2
    2
      bx-schema.sql
  5. 16
    0
      gr-schema.sql
  6. 7
    1
      gulpfile.js
  7. 1
    1
      lib/bximport.js
  8. 25
    0
      lib/goodreads.js
  9. 1
    2
      loc-index.sql
  10. 13
    0
      ol-explore.sql

+ 3
- 13
az-index.sql

@@ -8,7 +8,7 @@ CREATE TABLE az_users (
   user_key VARCHAR NOT NULL,
   UNIQUE (user_key)
 );
-INSERT INTO az_users (user_key) SELECT DISTINCT user_key FROM az_ratings
+INSERT INTO az_users (user_key) SELECT DISTINCT user_key FROM az_ratings;
 ANALYZE az_users;
 
 INSERT INTO isbn_id (isbn)
@@ -16,18 +16,8 @@ INSERT INTO isbn_id (isbn)
   FROM az_ratings WHERE asin NOT IN (SELECT isbn FROM isbn_id);
 ANALYZE isbn_id;
 
-DROP VIEW IF EXISTS az_loc_ratings;
-CREATE VIEW az_loc_ratings
-  AS SELECT user_id, COALESCE(cluster, bc_of_isbn(isbn_id)) AS book_id,
-       MEDIAN(rating) AS rating, COUNT(rating) AS nratings
-     FROM az_ratings
-       JOIN az_users USING (user_key)
-       JOIN isbn_id ON (isbn = asin)
-       LEFT JOIN loc_isbn_cluster USING (isbn_id)
-     GROUP BY user_id, COALESCE(cluster, bc_of_isbn(isbn_id));
-
-DROP VIEW IF EXISTS az_export_ratings;
-CREATE VIEW az_export_ratings
+DROP VIEW IF EXISTS az_ratings;
+CREATE VIEW az_ratings
   AS SELECT user_id, COALESCE(cluster, bc_of_isbn(isbn_id)) AS book_id,
                      MEDIAN(rating) AS rating, COUNT(rating) AS nratings
      FROM az_ratings

+ 2
- 2
az-schema.sql

@@ -1,5 +1,5 @@
-DROP TABLE IF EXISTS az_ratings CASCADE;
-CREATE TABLE az_ratings (
+DROP TABLE IF EXISTS az_raw_ratings CASCADE;
+CREATE TABLE az_raw_ratings (
   user_key VARCHAR NOT NULL,
   asin VARCHAR NOT NULL,
   rating REAL NOT NULL,

+ 2
- 28
bx-index.sql

@@ -7,32 +7,6 @@ INSERT INTO isbn_id (isbn)
   FROM bx_ratings WHERE isbn NOT IN (SELECT isbn FROM isbn_id);
 ANALYZE isbn_id;
 
-INSERT INTO loc_isbn_book_id (isbn, book_id)
-    WITH bad_isbns AS (SELECT DISTINCT isbn
-                       FROM bx_ratings br
-                       WHERE NOT EXISTS (SELECT * FROM loc_isbn_book_id ib WHERE ib.isbn = br.isbn))
-    SELECT isbn, nextval('loc_synthetic_book_id') FROM bad_isbns;
-ANALYZE loc_isbn_book_id;
-
-DROP VIEW IF EXISTS bx_loc_explicit_ratings;
-CREATE VIEW bx_loc_explicit_ratings
-  AS SELECT user_id, COALESCE(cluster, bc_of_isbn(isbn_id)) AS book_id,
-       MEDIAN(rating) AS rating, COUNT(rating) AS nratings
-     FROM bx_ratings
-       JOIN isbn_id USING (isbn)
-       LEFT JOIN loc_isbn_cluster USING (isbn_id)
-     WHERE rating > 0
-     GROUP BY user_id, book_id;
-
-DROP VIEW IF EXISTS bx_loc_all_ratings;
-CREATE VIEW bx_loc_all_ratings
-  AS SELECT user_id, COALESCE(cluster, bc_of_isbn(isbn_id)) AS book_id,
-       MEDIAN(rating) AS rating, COUNT(rating) AS nratings
-     FROM bx_ratings
-       JOIN isbn_id USING (isbn)
-       LEFT JOIN loc_isbn_cluster USING (isbn_id)
-     GROUP BY user_id, book_id;
-
 DROP VIEW IF EXISTS bx_explicit_ratings;
 CREATE VIEW bx_explicit_ratings
   AS SELECT user_id, COALESCE(cluster, bc_of_isbn(isbn_id)) AS book_id,
@@ -43,8 +17,8 @@ CREATE VIEW bx_explicit_ratings
      WHERE rating > 0
      GROUP BY user_id, book_id;
 
-DROP VIEW IF EXISTS bx_all_ratings;
-CREATE VIEW bx_all_ratings
+DROP VIEW IF EXISTS bx_ratings;
+CREATE VIEW bx_ratings
   AS SELECT user_id, COALESCE(cluster, bc_of_isbn(isbn_id)) AS book_id,
                      MEDIAN(rating) AS rating, COUNT(rating) AS nratings
      FROM bx_ratings

+ 2
- 2
bx-schema.sql

@@ -1,5 +1,5 @@
-DROP TABLE IF EXISTS bx_ratings CASCADE;
-CREATE TABLE bx_ratings (
+DROP TABLE IF EXISTS az_raw_ratings CASCADE;
+CREATE TABLE az_raw_ratings (
   user_id INTEGER NOT NULL,
   isbn VARCHAR NOT NULL,
   rating REAL NOT NULL

+ 16
- 0
gr-schema.sql

@@ -0,0 +1,16 @@
+CREATE TABLE gr_interaction ( -- GoodReads interaction records, kept as raw JSON
+  gr_int_rid SERIAL NOT NULL, -- auto-assigned row number
+  gr_int_data JSONB NOT NULL -- one JSON document per row
+);
+CREATE TABLE gr_book ( -- GoodReads book records, kept as raw JSON
+  gr_book_rid SERIAL NOT NULL, -- auto-assigned row number
+  gr_book_data JSONB NOT NULL -- one JSON document per row
+);
+CREATE TABLE gr_work ( -- GoodReads work records, kept as raw JSON
+  gr_work_rid SERIAL NOT NULL, -- auto-assigned row number
+  gr_work_data JSONB NOT NULL -- one JSON document per row
+);
+CREATE TABLE gr_author ( -- GoodReads author records, kept as raw JSON
+  gr_author_rid SERIAL NOT NULL, -- auto-assigned row number
+  gr_author_data JSONB NOT NULL -- one JSON document per row
+);

+ 7
- 1
gulpfile.js

@@ -10,6 +10,7 @@ const args = require('minimist')(process.argv.slice(2));
 
 const olimport = require('./lib/ol-import');
 const lkexport = require('./lib/lkexport');
+const grimport = require('./lib/goodreads')
 
 const olDate = args['ol-date'] || '2017-10-01';
 
@@ -25,11 +26,16 @@ exports.importOpenLib = gulp.parallel(
 exports.importOpenLib.description = 'Import all OpenLib data';
 
 exports.importAmazon = function() {
-  return cp.spawn('psql', ['-c', "\\copy az_ratings FROM 'data/ratings_Books.csv' WITH CSV"], {
+  return cp.spawn('psql', ['-c', "\\copy az_raw_ratings FROM 'data/ratings_Books.csv' WITH CSV"], {
     stdio: ['ignore', process.stdout, process.stderr]
   });
 };
 
+exports.importGoodReads = gulp.parallel( // composite task: runs the four GoodReads table imports concurrently
+  grimport.importAuthors, grimport.importBooks, grimport.importWorks,
+  grimport.importInteractions
+);
+
 exports.importBX = function() {
   const bxi = require('./lib/bximport');
   return bxi('data/BX-Book-Ratings.csv');

+ 1
- 1
lib/bximport.js

@@ -14,7 +14,7 @@ function importBXRatings(fn) {
            }))
            .pipe(through.obj((row, enc, cb) => {
              cb(null, {
-               text: 'INSERT INTO bx_ratings (user_id, isbn, rating) VALUES ($1, $2, $3)',
+               text: 'INSERT INTO bx_raw_ratings (user_id, isbn, rating) VALUES ($1, $2, $3)',
                name: 'insert-rating',
                values: [row['User-ID'], row['ISBN'].replace(/"/, ''), row['Book-Rating']]
              });

+ 25
- 0
lib/goodreads.js

@@ -0,0 +1,25 @@
+const fs = require('fs-extra');
+const zlib = require('zlib');
+const cp = require('child_process');
+
+function importTable(key, file) { // gunzip `file` and stream it into psql's \copy for table gr_<key>
+  const source = fs.createReadStream(file);
+  const psql = cp.spawn('psql', ['-c', `\\copy gr_${key} FROM STDIN`], { // NOTE(review): no column list, but the gr_* tables have two columns (rid, data) — confirm this loads
+    stdio: ['pipe', process.stdout, process.stderr] // stdin is a pipe we write to; psql's own output is passed through
+  });
+  source.pipe(zlib.createGunzip()).pipe(psql.stdin);
+  return psql; // the child process is handed back to the caller
+}
+
+exports.importBooks = () => ( // load the GoodReads books dump into gr_book
+  importTable('book', 'data/goodreads_books.json.gz')
+);
+exports.importInteractions = function() { // load the GoodReads interactions dump
+  return importTable('interaction', 'data/goodreads_interactions.json.gz'); // key must be singular: gr-schema.sql creates gr_interaction
+}
+exports.importWorks = () => ( // load the GoodReads works dump into gr_work
+  importTable('work', 'data/goodreads_book_works.json.gz')
+);
+exports.importAuthors = () => ( // load the GoodReads authors dump into gr_author
+  importTable('author', 'data/goodreads_book_authors.json.gz')
+);

+ 1
- 2
loc-index.sql

@@ -73,8 +73,7 @@ INSERT INTO isbn_id (isbn)
   WHERE isbn NOT IN (SELECT isbn FROM isbn_id);
 ANALYZE isbn_id;
 
-DROP TABLE IF EXISTS loc_rec_isbn;
-CREATE TABLE loc_rec_isbn
+CREATE MATERIALIZED VIEW loc_rec_isbn
   AS SELECT rec_id, isbn_id
      FROM loc_book JOIN loc_rec_extracted_isbn USING (rec_id) JOIN isbn_id USING (isbn)
      WHERE isbn IS NOT NULl AND char_length(isbn) IN (10,13);

+ 13
- 0
ol-explore.sql

@@ -0,0 +1,13 @@
+--- Views and such for understanding the contents of the OpenLibrary data
+CREATE MATERIALIZED VIEW ol_edition_json_keys -- per top-level key of edition_data: how many edition_ids carry it
+     AS SELECT json_key, COUNT(edition_id)
+        FROM (SELECT edition_id, jsonb_object_keys(edition_data) AS json_key -- one row per (edition, key) pair
+              FROM ol_edition) eks
+        GROUP BY json_key;
+
+
+CREATE MATERIALIZED VIEW ol_work_json_keys -- per top-level key of work_data: how many work_ids carry it
+     AS SELECT json_key, COUNT(work_id)
+        FROM (SELECT work_id, jsonb_object_keys(work_data) AS json_key -- one row per (work, key) pair
+              FROM ol_work) eks
+        GROUP BY json_key;