Merge branch 'master' into inspect

Michael Ekstrand 1 year ago
parent
commit
b197dc7d6b

+ 0
- 24
.vscode/launch.json

@@ -1,24 +0,0 @@
-{
-    // Use IntelliSense to learn about possible Node.js debug attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "type": "node",
-            "request": "launch",
-            "name": "Launch Program",
-            "program": "${workspaceRoot}\\import\\test-scan.js",
-            "args": [
-                "data\\ol_dump_authors_2016-07-31.txt.gz"
-            ],
-            "cwd": "${workspaceRoot}"
-        },
-        {
-            "type": "node",
-            "request": "attach",
-            "name": "Attach to Process",
-            "port": 5858
-        }
-    ]
-}

+ 7
- 3
Dvcfile

@@ -1,10 +1,14 @@
 deps:
-- path: pgstat://loc-mds-index-names
-  md5: 4d8d4e061447b2d53f39262c41e60f74
+- path: pgstat://loc-mds-book-info
+  md5: be165ab9a7023147ad3f91248839562f
+- path: pgstat://gr-book-info
+  md5: f2f75c4bf2dea889664ebb4a285a957a
 - path: pgstat://author-info
   md5: fbe8221bb82c3ec262c1297bb5487f80
 - path: pgstat://author-stats
   md5: fd4d4fb454a0c04cf4135183f0154691
 - path: pgstat://cluster-stats
   md5: e04cd89b4d82b2c18d1758079631f6cd
-md5: be6c08fbf8d70ecbebc56d4d66fa53fd
+- path: pgstat://loc-mds-index-names
+  md5: 4d8d4e061447b2d53f39262c41e60f74
+md5: aebfeacca0a62b07309f02696569231c

+ 1
- 1
README.md

@@ -2,7 +2,7 @@ This repository contains the code to import and integrate the book and rating da
 It imports and integrates data from several sources in a single PostgreSQL database; import scripts
 are primarily in Python, with Rust code for high-throughput processing of raw data files.
 
-If you use these scripts in any published reseaerch, cite [our paper](https://md.ekstrandom.net/pubs/book-author-gender):
+If you use these scripts in any published research, cite [our paper](https://md.ekstrandom.net/pubs/book-author-gender):
 
 > Michael D. Ekstrand, Mucun Tian, Mohammed R. Imran Kazi, Hoda Mehrpouyan, and Daniel Kluver. 2018. Exploring Author Gender in Book Rating and Recommendation. In *Proceedings of the 12th ACM Conference on Recommender Systems* (RecSys '18). ACM, pp. 242–250. DOI:[10.1145/3240323.3240373](https://doi.org/10.1145/3240323.3240373). arXiv:[1808.07586v1](https://arxiv.org/abs/1808.07586v1) [cs.IR].
 

+ 1
- 0
docs/.gitignore

@@ -0,0 +1 @@
+/_site

+ 1
- 0
docs/CNAME

@@ -0,0 +1 @@
+bookdata.piret.info

+ 6
- 0
docs/Gemfile

@@ -0,0 +1,6 @@
+source "https://rubygems.org"
+
+gem "github-pages", group: :jekyll_plugins
+gem "jekyll-include-cache", group: :jekyll_plugins
+
+gem 'wdm', '>= 0.1.0' if Gem.win_platform?

+ 253
- 0
docs/Gemfile.lock

@@ -0,0 +1,253 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (6.0.2.2)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 0.7, < 2)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+      zeitwerk (~> 2.2)
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    coffee-script (2.4.1)
+      coffee-script-source
+      execjs
+    coffee-script-source (1.11.1)
+    colorator (1.1.0)
+    commonmarker (0.17.13)
+      ruby-enum (~> 0.5)
+    concurrent-ruby (1.1.6)
+    dnsruby (1.61.3)
+      addressable (~> 2.5)
+    em-websocket (0.5.1)
+      eventmachine (>= 0.12.9)
+      http_parser.rb (~> 0.6.0)
+    ethon (0.12.0)
+      ffi (>= 1.3.0)
+    eventmachine (1.2.7-x64-mingw32)
+    execjs (2.7.0)
+    faraday (1.0.0)
+      multipart-post (>= 1.2, < 3)
+    ffi (1.12.2-x64-mingw32)
+    forwardable-extended (2.6.0)
+    gemoji (3.0.1)
+    github-pages (204)
+      github-pages-health-check (= 1.16.1)
+      jekyll (= 3.8.5)
+      jekyll-avatar (= 0.7.0)
+      jekyll-coffeescript (= 1.1.1)
+      jekyll-commonmark-ghpages (= 0.1.6)
+      jekyll-default-layout (= 0.1.4)
+      jekyll-feed (= 0.13.0)
+      jekyll-gist (= 1.5.0)
+      jekyll-github-metadata (= 2.13.0)
+      jekyll-mentions (= 1.5.1)
+      jekyll-optional-front-matter (= 0.3.2)
+      jekyll-paginate (= 1.1.0)
+      jekyll-readme-index (= 0.3.0)
+      jekyll-redirect-from (= 0.15.0)
+      jekyll-relative-links (= 0.6.1)
+      jekyll-remote-theme (= 0.4.1)
+      jekyll-sass-converter (= 1.5.2)
+      jekyll-seo-tag (= 2.6.1)
+      jekyll-sitemap (= 1.4.0)
+      jekyll-swiss (= 1.0.0)
+      jekyll-theme-architect (= 0.1.1)
+      jekyll-theme-cayman (= 0.1.1)
+      jekyll-theme-dinky (= 0.1.1)
+      jekyll-theme-hacker (= 0.1.1)
+      jekyll-theme-leap-day (= 0.1.1)
+      jekyll-theme-merlot (= 0.1.1)
+      jekyll-theme-midnight (= 0.1.1)
+      jekyll-theme-minimal (= 0.1.1)
+      jekyll-theme-modernist (= 0.1.1)
+      jekyll-theme-primer (= 0.5.4)
+      jekyll-theme-slate (= 0.1.1)
+      jekyll-theme-tactile (= 0.1.1)
+      jekyll-theme-time-machine (= 0.1.1)
+      jekyll-titles-from-headings (= 0.5.3)
+      jemoji (= 0.11.1)
+      kramdown (= 1.17.0)
+      liquid (= 4.0.3)
+      mercenary (~> 0.3)
+      minima (= 2.5.1)
+      nokogiri (>= 1.10.4, < 2.0)
+      rouge (= 3.13.0)
+      terminal-table (~> 1.4)
+    github-pages-health-check (1.16.1)
+      addressable (~> 2.3)
+      dnsruby (~> 1.60)
+      octokit (~> 4.0)
+      public_suffix (~> 3.0)
+      typhoeus (~> 1.3)
+    html-pipeline (2.12.3)
+      activesupport (>= 2)
+      nokogiri (>= 1.4)
+    http_parser.rb (0.6.0)
+    i18n (0.9.5)
+      concurrent-ruby (~> 1.0)
+    jekyll (3.8.5)
+      addressable (~> 2.4)
+      colorator (~> 1.0)
+      em-websocket (~> 0.5)
+      i18n (~> 0.7)
+      jekyll-sass-converter (~> 1.0)
+      jekyll-watch (~> 2.0)
+      kramdown (~> 1.14)
+      liquid (~> 4.0)
+      mercenary (~> 0.3.3)
+      pathutil (~> 0.9)
+      rouge (>= 1.7, < 4)
+      safe_yaml (~> 1.0)
+    jekyll-avatar (0.7.0)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-coffeescript (1.1.1)
+      coffee-script (~> 2.2)
+      coffee-script-source (~> 1.11.1)
+    jekyll-commonmark (1.3.1)
+      commonmarker (~> 0.14)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-commonmark-ghpages (0.1.6)
+      commonmarker (~> 0.17.6)
+      jekyll-commonmark (~> 1.2)
+      rouge (>= 2.0, < 4.0)
+    jekyll-default-layout (0.1.4)
+      jekyll (~> 3.0)
+    jekyll-feed (0.13.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-gist (1.5.0)
+      octokit (~> 4.2)
+    jekyll-github-metadata (2.13.0)
+      jekyll (>= 3.4, < 5.0)
+      octokit (~> 4.0, != 4.4.0)
+    jekyll-include-cache (0.2.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-mentions (1.5.1)
+      html-pipeline (~> 2.3)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-optional-front-matter (0.3.2)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-paginate (1.1.0)
+    jekyll-readme-index (0.3.0)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-redirect-from (0.15.0)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-relative-links (0.6.1)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-remote-theme (0.4.1)
+      addressable (~> 2.0)
+      jekyll (>= 3.5, < 5.0)
+      rubyzip (>= 1.3.0)
+    jekyll-sass-converter (1.5.2)
+      sass (~> 3.4)
+    jekyll-seo-tag (2.6.1)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-sitemap (1.4.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-swiss (1.0.0)
+    jekyll-theme-architect (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-cayman (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-dinky (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-hacker (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-leap-day (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-merlot (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-midnight (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-minimal (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-modernist (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-primer (0.5.4)
+      jekyll (> 3.5, < 5.0)
+      jekyll-github-metadata (~> 2.9)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-slate (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-tactile (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-time-machine (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-titles-from-headings (0.5.3)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-watch (2.2.1)
+      listen (~> 3.0)
+    jemoji (0.11.1)
+      gemoji (~> 3.0)
+      html-pipeline (~> 2.2)
+      jekyll (>= 3.0, < 5.0)
+    kramdown (1.17.0)
+    liquid (4.0.3)
+    listen (3.2.1)
+      rb-fsevent (~> 0.10, >= 0.10.3)
+      rb-inotify (~> 0.9, >= 0.9.10)
+    mercenary (0.3.6)
+    mini_portile2 (2.4.0)
+    minima (2.5.1)
+      jekyll (>= 3.5, < 5.0)
+      jekyll-feed (~> 0.9)
+      jekyll-seo-tag (~> 2.1)
+    minitest (5.14.0)
+    multipart-post (2.1.1)
+    nokogiri (1.10.9-x64-mingw32)
+      mini_portile2 (~> 2.4.0)
+    octokit (4.18.0)
+      faraday (>= 0.9)
+      sawyer (~> 0.8.0, >= 0.5.3)
+    pathutil (0.16.2)
+      forwardable-extended (~> 2.6)
+    public_suffix (3.1.1)
+    rb-fsevent (0.10.3)
+    rb-inotify (0.10.1)
+      ffi (~> 1.0)
+    rouge (3.13.0)
+    ruby-enum (0.7.2)
+      i18n
+    rubyzip (2.3.0)
+    safe_yaml (1.0.5)
+    sass (3.7.4)
+      sass-listen (~> 4.0.0)
+    sass-listen (4.0.0)
+      rb-fsevent (~> 0.9, >= 0.9.4)
+      rb-inotify (~> 0.9, >= 0.9.7)
+    sawyer (0.8.2)
+      addressable (>= 2.3.5)
+      faraday (> 0.8, < 2.0)
+    terminal-table (1.8.0)
+      unicode-display_width (~> 1.1, >= 1.1.1)
+    thread_safe (0.3.6)
+    typhoeus (1.3.1)
+      ethon (>= 0.9.0)
+    tzinfo (1.2.6)
+      thread_safe (~> 0.1)
+    unicode-display_width (1.7.0)
+    wdm (0.1.1)
+    zeitwerk (2.3.0)
+
+PLATFORMS
+  x64-mingw32
+
+DEPENDENCIES
+  github-pages
+  jekyll-include-cache
+  wdm (>= 0.1.0)
+
+BUNDLED WITH
+   1.17.2

+ 15
- 0
docs/_config.yml

@@ -0,0 +1,15 @@
+title: Book Data Tools
+name: Michael Ekstrand
+
+remote_theme: pmarsceill/just-the-docs
+
+aux_links:
+  "GitHub": https://github.com/BoiseState/bookdata-tools
+  "PIReT": https://piret.info
+
+footer_content: >
+  Copyright &copy; 2020 Boise State University.  Distributed under the MIT License.
+  This material is based upon work supported by the National Science Foundation under
+  Grant No. IIS 17-51278. Any opinions, findings, and conclusions or recommendations
+  expressed in this material are those of the author(s) and do not necessarily reflect
+  the views of the National Science Foundation.

+ 2
- 0
docs/_includes/head_custom.html

@@ -0,0 +1,2 @@
+<link rel=stylesheet type="text/css" href="https://unpkg.com/@openfonts/lato_latin/index.css">
+<link rel=stylesheet type="text/css" href="https://unpkg.com/@openfonts/source-code-pro_latin/index.css">

+ 2
- 0
docs/_sass/custom/custom.scss

@@ -0,0 +1,2 @@
+$body-font-family: 'Lato', sans-serif;
+$mono-font-family: 'Source Code Pro', monospace;

+ 83
- 0
docs/data/amazon.md

@@ -0,0 +1,83 @@
+---
+title: Amazon
+parent: Data Model
+nav_order: 6
+---
+
+# Amazon Ratings
+{: .no_toc}
+
+The [Amazon reviews data set](http://jmcauley.ucsd.edu/data/amazon/) consists of user-provided
+reviews and ratings for a variety of products.
+
+Currently we import the ratings-only data from the Books segment of the 2014 data set.
+
+**If you use this data, cite the paper(s) documented on the data set web site.**
+
+Imported data lives in the `az` schema.  The source files are not automatically downloaded.
+
+1. TOC
+{:toc}
+
+## Data Model Diagram
+
+![Amazon data model](az.svg)
+
+- [SVG file](az.svg)
+- [PlantUML source](az.puml)
+
+## Import Steps
+
+The import is controlled by the following DVC steps:
+
+`schemas/az-schema.dvc`
+:   Run `az-schema.sql` to set up the base schema.
+
+`import/az-ratings.dvc`
+:   Import raw Amazon ratings from `data/ratings_Books.csv`.
+
+`index/az-index.dvc`
+:   Run `az-index.sql` to index the rating data and integrate with book data.
+
+## Raw Data
+
+The raw rating data, with invalid characters cleaned up, is in the `az.raw_ratings` table, with
+the following columns:
+
+user_key
+:   The alphanumeric user identifier.
+
+asin
+:   The Amazon identification number for the product; for a book with an ISBN, this is the ISBN.
+
+rating
+:   The book rating.  The ratings are on a 1-5 scale.
+
+rating_time
+:   The rating timestamp.
+
+## Extracted Rating Tables
+
+We extract the following tables for Amazon ratings:
+
+`user_ids`
+:   Mapping from Amazon's alphanumeric user identifiers to numeric user IDs.
+
+`rating`
+:   Rating values suitable for LensKit use, with numeric user and item identifiers. The ratings are
+    pre-clustered, so the book IDs refer to book clusters and not individual ISBNs or editions.
+    This table has the following columns:
+
+    `user_id`
+    :   The user ID.
+
+    `book_id`
+    :   The [book code](ids.html#book-codes) for this book; the cluster identifier if available, or the
+        ISBN-based book code if this book is not in a cluster.
+
+    `rating`
+    :   The rating value; if the user has rated multiple books in a cluster, the median value is
+        reported (see the sketch after this list).
+
+    `nactions`
+    :   The number of book actions this user performed on this book.  Equivalent to the number of books in
+        the cluster that the user has rated.
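+
+The median aggregation is straightforward to express in SQL.  A minimal sketch (the actual
+import logic lives in `az-index.sql`; `az.rating_src` here is a hypothetical staging table of
+cluster-resolved ratings):
+
+```sql
+-- Sketch: one row per (user, cluster), with the median rating and action count.
+-- Assumes a hypothetical staging table az.rating_src(user_id, book_id, rating).
+SELECT user_id, book_id,
+       percentile_cont(0.5) WITHIN GROUP (ORDER BY rating) AS rating,
+       COUNT(*) AS nactions
+FROM az.rating_src
+GROUP BY user_id, book_id;
+```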

+ 49
- 0
docs/data/az.puml

@@ -0,0 +1,49 @@
+@startuml
+
+title AZ Ratings
+skinparam linetype ortho
+!include theme.iuml
+
+entity isbn_id {
+  isbn_id
+  --
+  isbn
+}
+
+entity isbn_cluster {
+  cluster
+  --
+  isbn_id
+}
+
+isbn_cluster }|- isbn_id
+
+entity raw_ratings {
+    --
+    user_key
+    asin
+    rating
+    rating_time
+}
+
+entity user_ids <<derived>> {
+    user_id
+    --
+    user_key
+}
+
+entity rating <<derived>> {
+    user_id
+    book_id
+    --
+    rating
+    nactions
+}
+
+raw_ratings <. rating
+raw_ratings <. user_ids
+
+rating }|-- user_ids
+rating }|- isbn_cluster : book_id:cluster
+
+@enduml

File diff suppressed because it is too large
+ 79
- 0
docs/data/az.svg

+ 90
- 0
docs/data/bx.md

@@ -0,0 +1,90 @@
+---
+title: BookCrossing
+parent: Data Model
+nav_order: 5
+---
+
+# BookCrossing
+{: .no_toc}
+
+The [BookCrossing data set](http://www2.informatik.uni-freiburg.de/~cziegler/BX/) consists of user-provided
+ratings — both implicit and explicit — of books.
+
+**If you use this data, cite:**
+
+> Cai-Nicolas Ziegler, Sean M. McNee, Joseph A. Konstan, and Georg Lausen. 2005. Improving Recommendation Lists Through Topic Diversification. Proceedings of the 14th International World Wide Web Conference (WWW '05), May 10-14, 2005, Chiba, Japan.
+
+Imported data lives in the `bx` schema.  The source data files are automatically downloaded and unpacked by
+the provided scripts and DVC stages.
+
+1. TOC
+{:toc}
+
+## Data Model Diagram
+
+![BookCrossing data model](bx.svg)
+
+- [SVG file](bx.svg)
+- [PlantUML source](bx.puml)
+
+## Import Steps
+
+The import is controlled by the following DVC steps:
+
+`data/BX.dvc`
+:   Unpack the BookCrossing zip file.
+
+`data/BX-CSV-Dump.zip.dvc`
+:   Download the BookCrossing zip file.
+
+`schemas/bx-schema.dvc`
+:   Run `bx-schema.sql` to set up the base schema.
+
+`import/bx-ratings.dvc`
+:   Import raw BookCrossing ratings from `data/BX-Book-Ratings.csv`.
+
+`index/bx-index.dvc`
+:   Run `bx-index.sql` to index the rating data and integrate with book data.
+
+## Raw Data
+
+The raw rating data, with invalid characters cleaned up, is in the `bx.raw_ratings` table, with
+the following columns:
+
+user_id
+:   The user identifier (numeric).
+
+isbn
+:   The book ISBN (text).
+
+rating
+:   The book rating.  The ratings are on a 1-10 scale, with 0 indicating an implicit-feedback record.
+
+## Extracted Rating Tables
+
+We extract the following tables for BookCrossing ratings:
+
+`rating`
+:   The explicit ratings (`rating > 0`) from the raw ratings table.
+
+`add_action`
+:   Records of users adding books, either by rating or through an implicit feedback action,
+    without rating values.
+
+Both of these tables are pre-clustered, so the book IDs refer to book clusters and not individual
+ISBNs or editions.  They have the following columns:
+
+`user_id`
+:   The user ID.
+
+`book_id`
+:   The [book code](ids.html#book-codes) for this book; the cluster identifier if available, or the
+    ISBN-based book code if this book is not in a cluster.
+
+`rating`
+:   The rating value; if the user has rated multiple books in a cluster, the median value is reported.
+    This field is only on the `rating` table.
+
+`nactions`
+:   The number of book actions this user performed on this book.  Equivalent to the number of books in
+    the cluster that the user has added or rated.

+ 49
- 0
docs/data/bx.puml

@@ -0,0 +1,49 @@
+@startuml
+
+title BookCrossing Ratings
+skinparam linetype ortho
+!include theme.iuml
+
+entity isbn_id {
+  isbn_id
+  --
+  isbn
+}
+
+entity isbn_cluster {
+  cluster
+  --
+  isbn_id
+}
+
+isbn_cluster }|- isbn_id
+
+entity raw_ratings {
+    --
+    user_id
+    isbn
+    rating
+}
+
+entity rating <<derived>> {
+    user_id
+    book_id
+    --
+    rating
+    nactions
+}
+
+entity add_action <<derived>> {
+    user_id
+    book_id
+    --
+    nactions
+}
+
+raw_ratings <.. rating
+raw_ratings <. add_action
+
+rating }|- isbn_cluster : book_id:cluster
+add_action }|-- isbn_cluster : book_id:cluster
+
+@enduml

File diff suppressed because it is too large
+ 79
- 0
docs/data/bx.svg

+ 57
- 0
docs/data/cluster.md

@@ -0,0 +1,57 @@
+---
+title: Clusters
+parent: Data Model
+nav_order: 8
+---
+
+# Book Clusters
+{: .no_toc}
+
+For recommendation and analysis, we often want to look at *works* instead of individual books or
+editions of those books.  The same material by the same author(s) may be reprinted in many different
+editions, with different ISBNs, and sometimes separate ratings from the same user.
+
+There are a variety of ways to deal with this.  GoodReads and OpenLibrary both have the concept of
+a ‘work’ to group together related editions (the Library of Congress also has such a concept
+internally in its BIBFRAME schema, but that data is not currently available for integration).
+
+Other services, such as [ThingISBN](https://blog.librarything.com/thingology/2006/06/introducing-thingisbn/)
+and OCLC's [xISBN](https://www.worldcat.org/affiliate/webservices/xisbn/app.jsp) both link together ISBNs:
+given a query ISBN, they will return a list of ISBNs believed to be for the same book.
+
+Using the book data sources here, we have implemented comparable functionality in a manner that
+anyone can reproduce from public data.  We call the resulting equivalence sets ‘book clusters’.
+
+## Clustering Algorithm
+
+Our clustering algorithm begins by forming a bipartite graph of ISBNs and record identifiers.  We extract
+records from the following:
+
+- Library of Congress book records
+- OpenLibrary editions. If the edition has an associated work, we use the work identifier instead of
+  the book identifier.
+- GoodReads books.  If the book has an associated work, we use the work identifier instead.
+
+We convert each record identifier to a [book code](ids.html#book-codes) to avoid confusion between
+different identifier types (and keep ID number reuse between data sets from colliding).
+
+There is an edge from an ISBN to a record if that record reports the ISBN as one of its identifiers.
+
+We then compute connected components on this graph, and treat each connected component as a single
+‘book’ (what we call a *book cluster*).  The cluster is identified by the smallest book code of any
+of the book records it comprises, but these cluster identifiers shouldn't be treated as meaningful.
+
+The idea is that if two ISBNs appear together on a book record, that is evidence they are for the
+same book; likewise, if two book records have the same ISBN, it is evidence they record the same book.
+Pooling this evidence across all data sources maximizes the ability to detect book clusters.
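+
+To make the component computation concrete, here is a minimal SQL sketch of label
+propagation over that graph (illustrative only; `edges` is a hypothetical table of
+ISBN-to-record links with records already mapped to book codes, and the actual
+implementation may differ):
+
+```sql
+-- Hypothetical input: edges(isbn_id INTEGER, book_code INTEGER).
+-- Start each ISBN at the smallest book code it appears with.
+CREATE TEMPORARY TABLE cc AS
+SELECT isbn_id, MIN(book_code) AS cluster
+FROM edges GROUP BY isbn_id;
+
+DO $$
+DECLARE nchanged BIGINT := 1;
+BEGIN
+    WHILE nchanged > 0 LOOP
+        -- Pull each ISBN down to the smallest label of any ISBN sharing a record with it.
+        WITH rec_min AS (
+            SELECT e.book_code, MIN(c.cluster) AS cluster
+            FROM edges e JOIN cc c USING (isbn_id)
+            GROUP BY e.book_code
+        ), isbn_min AS (
+            SELECT e.isbn_id, MIN(r.cluster) AS cluster
+            FROM edges e JOIN rec_min r USING (book_code)
+            GROUP BY e.isbn_id
+        )
+        UPDATE cc SET cluster = isbn_min.cluster
+        FROM isbn_min
+        WHERE cc.isbn_id = isbn_min.isbn_id
+          AND isbn_min.cluster < cc.cluster;
+        GET DIAGNOSTICS nchanged = ROW_COUNT;
+    END LOOP;
+END $$;
+
+-- When the loop stops changing rows, cc maps each ISBN to its connected component,
+-- labeled by the component's smallest book code.
+```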
+
+The `isbn_cluster` table maps each ISBN to its associated cluster.  Individual data sources may also
+have an `isbn_cluster` table (e.g. `gr.isbn_cluster`); that is the result of clustering ISBNs using
+only the book records from that data source.  However, all clustered results such as rating tables
+are based on the all-source book clusters.
+
+## Known Problems
+
+Some book sets have ISBNs, which cause them to link together books that should not be clustered.
+The Library of Congress identifies many of these ISBNs as set ISBNs, and we are examining the
+prospect of using this to exclude them from clustering.

+ 151
- 0
docs/data/goodreads.md

@@ -0,0 +1,151 @@
+---
+title: GoodReads
+parent: Data Model
+nav_order: 7
+---
+
+# GoodReads (UCSD Book Graph)
+{: .no_toc}
+
+We import GoodReads data from the [UCSD Book Graph](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home)
+for additional book and user interaction information.  The source files are not automatically downloaded; you
+will need the following:
+
+- Books
+- Book works
+- Authors
+- Book genres
+- Book series
+- **Full** interaction data
+
+We do not yet support reviews.
+
+**If you use this data, cite the paper(s) documented on the data set web site.**
+
+Imported data lives in the `gr` schema.
+
+1. TOC
+{:toc}
+
+## Data Model Diagram
+
+![GoodReads model diagram](goodreads.svg)
+
+- [SVG file](goodreads.svg)
+- [PlantUML source](goodreads.puml)
+
+## Import Steps
+
+The import is controlled by the following DVC steps:
+
+`schemas/gr-schema.dvc`
+:   Run `gr-schema.sql` to set up the base schema.
+
+`import/gr-*.dvc`
+:   Import raw GoodReads data from files under `data/`
+
+`index/gr-index-books.dvc`
+:   Run `gr-index-books.sql` to index the book data and extract identifiers.
+
+`index/gr-book-info.dvc`
+:   Run `gr-book-info.sql` to extract additional book and work metadata.
+
+`index/gr-index-ratings.dvc`
+:   Run `gr-index-ratings.sql` to index the rating and interaction data.
+
+## Raw Data
+
+The raw rating data, with invalid characters cleaned up, is in the various `gr.raw_*` tables:
+
+- `raw_book`
+- `raw_work`
+- `raw_author`
+- `raw_series`
+- `raw_book_genres`
+- `raw_interaction`
+
+Each table has the following columns:
+
+gr_*type*_rid
+:   Numeric record identifier generated at import time.  Throughout this page, we will refer to these as record identifiers; they are distinct from the identifiers GoodReads uses for books and works, as those are not known until the JSON is unpacked.
+
+gr_*type*_data
+:   `JSONB` column containing imported data.
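+
+For example (illustrative, not an import step), the GoodReads book ID can be unpacked
+directly with PostgreSQL's JSON operators:
+
+```sql
+-- Illustrative: extract the GoodReads book ID from the raw JSONB column.
+SELECT gr_book_rid, (gr_book_data->>'book_id')::int AS gr_book_id
+FROM gr.raw_book
+LIMIT 10;
+```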
+
+## Extracted Book Tables
+
+We extract the following tables for book and work data:
+
+`work_ids`
+:   GoodReads work identifiers.
+
+`book_ids`
+:   GoodReads book identifiers.  This maps each GoodReads book record identifier to the following identifiers:
+
+    - book ID
+    - work ID
+    - ASIN
+    - ISBN 10 (`gr_isbn`)
+    - ISBN 13 (`gr_isbn13`)
+
+    This table extracts the *textual* versions of ISBNs and ASINs directly from the `raw_book` table.  It does not resolve them to ISBN IDs.
+
+`book_isbn`
+:   Map GoodReads books to ISBN IDs and book codes.  This does **not** use ASINs, just ISBN-10s and ISBN-13s.
+
+`book_genres`
+:   Genre membership (and scores) for each book.  This is a direct extract of the book genres file from UCSD.
+
+`work_title`
+:   The title of each work.
+
+`book_pub_date`
+:   The publication date of each book.  It extracts the year, month, and day; if all three are present, then `pub_date` contains the date as an SQL date.  These are the `publication_*` fields in the book JSON data.
+
+`work_pub_date`
+:   The original publication date of each work.  Extracted like `book_pub_date`, but from a work's `original_publication_*` fields.
+
+`book_cluster`
+:   The book cluster each book is a member of.
+
+## Extracted Interaction Tables
+
+We extract the following tables for book ratings and interactions (add-to-shelf actions):
+
+`user_info`
+:   Mapping between user record IDs and GoodReads user IDs.
+
+`interaction`
+:   Extract of basic information about each entry in the Interactions file.  These interactions
+    represent an add-to-shelf action, optionally with a rating.  We extract the following:
+
+    `gr_interaction_rid`
+    :   The interaction record identifier (PK)
+
+    `gr_book_id`
+    :   GoodReads book ID
+
+    `gr_user_rid`
+    :   User record identifier (we use record IDs instead of user IDs to keep them numeric)
+
+    `rating`
+    :   The 5-star rating value (if provided)
+
+    `is_read`
+    :   `isRead` flag from original JSON data.
+
+    `date_added`
+    :   The date the book was added to the shelf.
+
+    `date_updated`
+    :   The update date for this interaction.
+
+`rating`
+:   Rating table suitable for use in LensKit.  This is aggregated
+    by book cluster, and contains both the median rating and the
+    last rating, along with the median update date as the timestamp.
+
+`add_action`
+:   Add-action table suitable for use in LensKit.  Also aggregated by book cluster,
+    with the first and last (update) date as the timestamps, and number of interactions
+    with this book.

+ 192
- 0
docs/data/goodreads.puml

@@ -0,0 +1,192 @@
+@startuml
+
+title GoodReads Data
+skinparam linetype ortho
+!include theme.iuml
+
+entity isbn_id {
+  isbn_id
+  --
+  isbn
+}
+
+entity isbn_cluster {
+  cluster
+  --
+  isbn_id
+}
+
+isbn_id -|{ isbn_cluster
+
+entity raw_book {
+    gr_book_rid
+    --
+    gr_book_data
+}
+entity raw_work {
+    gr_work_rid
+    --
+    gr_work_data
+}
+/'
+entity raw_author {
+    gr_author_rid
+    --
+    gr_author_data
+}
+entity raw_series {
+    gr_series_rid
+    --
+    gr_series_data
+}
+'/
+entity raw_book_genres {
+    gr_book_genres_rid
+    --
+    gr_book_genres_data
+}
+entity raw_interaction {
+    gr_interaction_rid
+    --
+    gr_interaction_data
+}
+
+entity work_ids <<derived>> {
+    gr_work_rid
+    --
+    gr_work_id
+}
+
+raw_work <.. work_ids
+raw_work ||--|| work_ids
+
+entity book_ids <<derived>> {
+    gr_book_rid
+    --
+    gr_work_id
+    # gr_book_id
+    gr_asin
+    gr_isbn
+    gr_isbn13
+}
+
+raw_book <.. book_ids
+raw_book ||--|| book_ids
+work_ids -{ book_ids
+
+entity book_isbn <<derived>> {
+    --
+    gr_book_id
+    isbn_id
+    book_code
+}
+
+book_ids <.. book_isbn
+book_ids ||--|{ book_isbn
+book_isbn - isbn_id
+
+entity book_genres <<derived>> {
+    --
+    gr_book_rid
+    gr_book_id
+    genre
+    score
+}
+
+raw_book_genres <.. book_genres
+book_ids ||-|{ book_genres
+
+entity work_title <<derived>> {
+    gr_work_rid
+    --
+    gr_work_id
+    work_title
+}
+
+raw_work <.. work_title
+work_ids ||--o| work_title
+'work_title |o-|| work_ids
+
+entity work_pub_date <<derived>> {
+    gr_work_rid
+    --
+    gr_work_id
+    pub_year
+    pub_month
+    pub_day
+    pub_date
+}
+
+raw_work <.. work_pub_date
+work_pub_date |o-|| work_ids
+
+entity book_pub_date <<derived>> {
+    gr_book_rid
+    --
+    gr_book_id
+    pub_year
+    pub_month
+    pub_day
+    pub_date
+}
+
+raw_book <.. book_pub_date
+book_pub_date |o-|| book_ids
+
+entity user_ids <<derived>> {
+    gr_user_rid
+    --
+    gr_user_id
+}
+
+raw_interaction <.. user_ids
+
+entity interaction <<derived>> {
+    gr_interaction_rid
+    --
+    gr_book_id
+    gr_user_rid
+    rating
+    is_read
+    date_add
+    date_updated
+}
+
+raw_interaction <.. interaction
+raw_interaction ||--|| interaction
+
+book_ids --{ interaction
+interaction }-- user_ids
+
+entity rating <<derived>> {
+    user_id : gr_user_rid
+    book_id : cluster
+    rating
+    last_rating
+    timestamp
+    nratings
+}
+
+interaction <.. rating
+user_ids }- rating
+
+entity add_action <<derived>> {
+    user_id : gr_user_rid
+    book_id : cluster
+    first_time
+    last_time
+    nactions
+}
+
+interaction <.. add_action
+user_ids }- add_action
+
+entity book_cluster {
+    gr_book_id
+    --
+    cluster
+}
+
+book_ids ||-|| book_cluster
+
+@enduml

File diff suppressed because it is too large
+ 244
- 0
docs/data/goodreads.svg

+ 45
- 0
docs/data/ids.md

@@ -0,0 +1,45 @@
+---
+title: Common Identifiers
+parent: Data Model
+nav_order: 1
+---
+
+# Common Identifiers
+{: .no_toc}
+
+There are two key identifiers that are used across data sets.
+
+1. TOC
+{:toc}
+
+## ISBNs
+
+We use ISBNs for a lot of data linking.  In order to speed up ISBN-based operations, we map textual ISBNs to numeric ‘ISBN IDs’.
+
+The `isbn_id` table manages ISBN IDs and their mappings:
+
+| Column  | Purpose         |
+| ------- | --------------- |
+| isbn_id | ISBN identifier |
+| isbn    | Textual ISBNs   |
+
+Each type of ISBN (ISBN-10, ISBN-13) is considered a distinct ISBN. We also consider other ISBN-like things, particularly ASINs, to be ISBNs.
+
+Most derived tables that work with ISBNs use `isbn_id`s.
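+
+Resolving between the two forms is a simple lookup (illustrative; the ISBN and ID values here
+are made up):
+
+```sql
+-- Illustrative lookups against the isbn_id table.
+SELECT isbn_id FROM isbn_id WHERE isbn = '9780262033848';
+SELECT isbn FROM isbn_id WHERE isbn_id = 42;
+```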
+
+## Book Codes
+
+We also use *book codes*, common identifiers for integrated 'books' across data sets. These are derived from identifiers in the various data sets, with `bc_of_*` functions.  Each book code source is assigned to a different 10M number band so we can, if needed, derive the source from a book code.
+
+| Source       | Function             | Numspace |
+| ------------ | -------------------- | -------- |
+| OL Work      | `bc_of_work`         | 10M      |
+| OL Edition   | `bc_of_edition`      | 20M      |
+| LOC Record   | `bc_of_loc_rec`      | 30M      |
+| GR Work      | `bc_of_gr_work`      | 40M      |
+| GR Book      | `bc_of_gr_book`      | 50M      |
+| LOC Work     | `bc_of_loc_work`     | 60M      |
+| LOC Instance | `bc_of_loc_instance` | 70M      |
+| ISBN         | `bc_of_isbn`         | 90M      |
+
+The LOC Work and Instance sources are not currently used; they are intended for future use when we are able to import BIBFRAME data from the Library of Congress.
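+
+Because each source occupies its own 10M band, the source of a book code can be recovered
+with integer arithmetic alone (a sketch; the `bc_of_*` functions themselves are defined in
+the schema scripts):
+
+```sql
+-- Illustrative: recover the band (in millions) a book code belongs to.
+SELECT book_code, (book_code / 10000000) * 10 AS band_millions
+FROM (VALUES (10000123), (52345678)) AS t(book_code);
+-- 10000123 -> 10 (OL Work), 52345678 -> 50 (GR Book)
+```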

+ 14
- 0
docs/data/index.md

@@ -0,0 +1,14 @@
+---
+title: Data Model
+has_children: true
+nav_order: 3
+---
+
+# Data Model
+
+This section describes the layout of the imported data, and the logic behind its
+integration.
+
+It doesn't describe every intermediate detail or table.
+
+The data is organized into PostgreSQL schemas to make it easier to navigate; one effect of this is that if you just look at the default `public` schema, you will see very few of the tables.  Further, some tables are materialized views, so they may not show up in the table list.  The `\dm` command in `psql` shows materialized views.

+ 100
- 0
docs/data/loc.md

@@ -0,0 +1,100 @@
+---
+title: Library of Congress
+parent: Data Model
+nav_order: 2
+---
+
+# Library of Congress
+{: .no_toc}
+
+One of our sources of book data is the Library of Congress [MDSConnect Books](https://www.loc.gov/cds/products/MDSConnect-books_all.html) bibliographic records.
+
+We download and import the XML versions of these files.
+
+Imported data lives under the `locmds` schema.
+
+1. TOC
+{:toc}
+
+## Data Model Diagram
+
+![LOC data model](loc.svg)
+
+- [SVG file](loc.svg)
+- [PlantUML source](loc.puml)
+
+## Import Steps
+
+The import is controlled by the following DVC steps:
+
+`schemas/loc-mds-schema.dvc`
+:   Run `loc-mds-schema.sql` to set up the base schema.
+
+`import/loc-mds-books.dvc`
+:   Import raw MARC data from `data/loc-books/`.
+
+`index/loc-mds-index-books.dvc`
+:   Run `loc-mds-index-books.sql` to index the book data and extract tables.
+
+`index/loc-mds-book-info.dvc`
+:   Run `loc-mds-book-info.sql` to extract additional book data into tables.
+
+## Raw Book Data
+{: #raw}
+
+The `locmds.book_marc_field` table contains the raw data imported from the MARC files, as MARC fields.  The LOC book data follows the [MARC 21 Bibliographic Data format](https://www.loc.gov/marc/bibliographic/); the various tags, field codes, and indicators are defined there.  This table is not terribly useful on its own, but it is the source from which the other tables are derived.
+
+It has the following columns:
+
+`rec_id`
+:   The record identifier (generated at import)
+
+`fld_no`
+:   The field number.  This corresponds to a single MARC field entry; rows in this table
+    containing data from MARC subfields will share a `fld_no` with their containing field.
+
+`tag`
+:   The MARC tag; either a three-digit number, or `LDR` for the MARC leader.
+
+`ind1`, `ind2`
+:   MARC indicators.  Their meanings are defined in the MARC specification.
+
+`sf_code`
+:   MARC subfield code.
+
+`contents`
+:   The raw textual content of the MARC field or subfield.
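+
+Queries against this table filter on tags and subfield codes; this is essentially how the
+extracted tables below are built.  For example (illustrative):
+
+```sql
+-- Illustrative: primary author names are MARC field 100, subfield 'a'.
+SELECT rec_id, contents
+FROM locmds.book_marc_field
+WHERE tag = '100' AND sf_code = 'a'
+LIMIT 10;
+```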
+
+## Extracted Book Tables
+
+We then extract a number of tables and views from this MARC data.  These tables include:
+
+`book_record_info`
+:   Code information for each book record.
+
+    - MARC Control Number
+    - Library of Congress Control Number (LCCN)
+    - Record status
+    - Record type
+    - Bibliographic level
+
+    More information about the last three is in the [leader specification](https://www.loc.gov/marc/bibliographic/bdleader.html).
+
+`book`
+:   A subset of `book_record_info` intended to capture the actual books in the collection,
+    as opposed to other types of materials.  We consider a book to be anything that has MARC
+    record type ‘a’ or ‘t’ (language material), and is not also classified as a government
+    record in MARC field 008.
+
+`book_rec_isbn`
+:   Map book records to their ISBNs.
+
+`book_author_name`
+:   Author names for book records.  This only extracts the primary author name (MARC field 100
+    subfield ‘a’).
+
+`book_pub_year`
+:   Book publication year (MARC field 260 subfield ‘c’).
+
+`book_title`
+:   Book title (MARC field 245 subfield ‘a’).

+ 76
- 0
docs/data/loc.puml

@@ -0,0 +1,76 @@
+@startuml
+
+title LOC Book Data
+skinparam linetype ortho
+!include theme.iuml
+
+entity isbn_id {
+  isbn_id
+  --
+  isbn
+}
+
+entity book_marc_record {
+  --
+  rec_id
+  fld_no
+  tag
+  ind1
+  ind2
+  sf_code
+  contents
+}
+
+entity book_record_info <<derived>> {
+  rec_id
+  --
+  marc_cn
+  lccn
+  status
+  rec_type
+  bib_level
+}
+
+entity book {
+}
+
+entity book_rec_isbn {
+  rec_id
+  isbn_id
+}
+
+entity book_author_name <<derived>> {
+  --
+  rec_id
+  name
+}
+
+entity book_pub_year <<derived>> {
+  rec_id
+  --
+  pub_year
+}
+
+entity book_title <<derived>> {
+  rec_id
+  --
+  book_title
+}
+
+book_marc_record <. book_record_info
+
+book_record_info <|- book
+
+book --o{ book_rec_isbn
+book_rec_isbn }o- isbn_id
+
+book_marc_record <.. book_author_name
+book_record_info -o{ book_author_name
+
+book_marc_record <.. book_pub_year
+book_record_info --o| book_pub_year
+
+book_marc_record <.. book_title
+book_record_info --o{ book_title
+
+@enduml

File diff suppressed because it is too large
+ 114
- 0
docs/data/loc.svg

+ 103
- 0
docs/data/openlib.md

@@ -0,0 +1,103 @@
+---
+title: OpenLibrary
+parent: Data Model
+nav_order: 3
+---
+
+# OpenLibrary
+{: .no_toc}
+
+We also source book data from [OpenLibrary](https://openlibrary.org), as downloaded from
+their [developer dumps](https://openlibrary.org/developers/dumps).
+
+The DVC control files automatically download the appropriate version.  The version can be
+updated by modifying the `data/ol_dump_*.txt.gz.dvc` files.
+
+Imported data lives in the `ol` schema.
+
+1. TOC
+{:toc}
+
+## Import Steps
+
+The import is controlled by the following DVC steps:
+
+`schemas/ol-schema.dvc`
+:   Run `ol-schema.sql` to set up the base schema.
+
+`import/ol-works.dvc`
+:   Import raw OpenLibrary works from `data/ol_dump_works.txt.gz`.
+
+`import/ol-editions.dvc`
+:   Import raw OpenLibrary editions from `data/ol_dump_editions.txt.gz`.
+
+`import/ol-authors.dvc`
+:   Import raw OpenLibrary authors from `data/ol_dump_authors.txt.gz`.
+
+`index/ol-index.dvc`
+:   Run `ol-index.sql` to index the book data and extract tables.
+
+`index/ol-book-info.dvc`
+:   Run `ol-book-info.sql` to extract additional book data into tables.
+
+## Raw Data
+
+OpenLibrary provides its data as JSON.  It is imported as-is into a JSONB column in three tables:
+
+- `ol.author`
+- `ol.work`
+- `ol.edition`
+
+Each of these has the following columns:
+
+*type*_id
+:    A numeric record identifier generated at import.
+
+*type*_key
+:    The OpenLibrary identifier key (e.g. `/books/3180A3`).
+
+*type*_data
+:    The raw JSON data containing the record.
+
+We use PostgreSQL's JSON operators and functions to extract the data from these tables for the
+rest of the OpenLibrary data model.
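+
+For example (illustrative; the real extractions live in `ol-index.sql` and `ol-book-info.sql`):
+
+```sql
+-- Illustrative: unpack the ISBN-10 array from the raw edition JSON.
+SELECT edition_id, jsonb_array_elements_text(edition_data->'isbn_10') AS isbn
+FROM ol.edition
+LIMIT 10;
+```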
+
+## Extracted Edition Tables
+
+We extract the following tables from OpenLibrary editions:
+
+`edition_author`
+:   Links `edition` and `author` to record an edition's authors.
+
+`edition_first_author`
+:   Links `edition` and `author` to record an edition's first author.
+
+`edition_work`
+:   Links each `edition` to its `work`(s)
+
+`edition_isbn`
+:   The raw ISBNs for each `edition` (*not* ISBN IDs)
+
+`isbn_link`
+:   Link ISBNs, editions, and works, along with the book code derived from an edition's
+    work and edition IDs.  If an edition belongs to multiple works, it will appear multiple
+    times here.  This table violates 4NF.
+
+## Extracted Work Tables
+
+We extract the following tables from OpenLibrary works:
+
+`work_author`
+:   Links `work` and `author` to record a work's authors.
+
+`work_first_author`
+:   Links `work` and `author` to record a work's first author.
+
+`work_subject`
+:   The `subjects` entries for each work.
+
+## Extracted Author Tables
+
+`author_name`
+:   The names for each author.  An author may have more than one listed name; this extracts
+    all of them.

+ 9
- 0
docs/data/theme.iuml

@@ -0,0 +1,9 @@
+skinparam class {
+    BackgroundColor white
+    BackgroundColor<<derived>> #f5f6fa
+    ArrowColor #27262b
+    BorderColor #7253ed
+}
+
+hide empty members
+hide circle

+ 66
- 0
docs/data/viaf.md

@@ -0,0 +1,66 @@
+---
+title: VIAF
+parent: Data Model
+nav_order: 4
+---
+
+# Virtual Internet Authority File
+{: .no_toc}
+
+We source author data from the [Virtual Internet Authority File](http://viaf.org), as downloaded from
+their [data dumps](http://viaf.org/viaf/data).  This file is slow and error-prone to download, and is
+*not* auto-downloaded.
+
+Imported data lives in the `viaf` schema.
+
+1. TOC
+{:toc}
+
+## Import Steps
+
+The import is controlled by the following DVC steps:
+
+`schemas/viaf-schema.dvc`
+:   Run `viaf-schema.sql` to set up the base schema.
+
+`import/viaf.dvc`
+:   Import raw VIAF MARC data from `data/viaf-clusters-marc21.xml.gz`.
+
+`index/viaf-index.dvc`
+:   Run `viaf-index.sql` to index the MARC data and extract tables.
+
+## Raw Data
+
+VIAF data is in [MARC 21 Authority Record format](https://www.loc.gov/marc/authority/).  The raw
+MARC data is imported into the `marc_field` table with the [same format as LOC](loc.html#raw).
+
+## Extracted Author Tables
+
+We extract the following tables for VIAF authors:
+
+`author_name`
+:   The author's name(s).  We insert an author name for each field with tag 700 and subfield code ‘a’.
+    For all author names of the form ‘Family, Given’, we insert an additional record with the form
+    ‘Given Family’ and indicator ‘S’ (see the sketch after this list).  This helps maximize links.
+
+`author_gender`
+:   The author's gender, from field 375 subfield ‘a’.  This is a raw extract of all gender identity
+    assertions in the record; we resolve multiple assertions later in the data integration process.
+
+## VIAF Gender Vocabulary
+
+The MARC [gender field](https://www.loc.gov/marc/authority/ad375.html) is defined as the author's
+gender *identity*.  It allows identities from an open vocabulary, along with start and end dates
+for the validity of each identity.
+
+The Program for Cooperative Cataloging Task Group on Gender in Name Authority Records produced a
+[report](https://www.loc.gov/aba/pcc/documents/Gender_375%20field_RecommendationReport.pdf) with
+recommendations for how to record this field.  Many libraries contributing to the Library of Congress
+file, from which many VIAF records are sourced, follow these recommendations, but it is not safe
+to assume they are universally followed by all VIAF contributors.
+
+Further, as near as we can tell, the VIAF removes all non-binary gender identities or converts them
+to ‘unknown’.
+
+This data should only be used with great care.  We discuss these limitations in [the extended
+preprint](https://md.ekstrandom.net/pubs/bag-extended).

+ 50
- 0
docs/index.md

@@ -0,0 +1,50 @@
+---
+title: Overview
+---
+
+# Book Data Tools
+
+The PIReT Book Data Tools are a set of tools for ingesting, integrating, and indexing
+a variety of sources of book data, created by the [People and Information Research Team](https://piret.info)
+at [Boise State University](https://boisestate.edu).  The result of running these tools is a PostgreSQL
+database with the raw data, various useful extracted features, and integrated identifiers across the various
+data sources for cross-linking.
+
+If you use these scripts in any published research, cite [our paper](https://md.ekstrandom.net/pubs/book-author-gender):
+
+> Michael D. Ekstrand, Mucun Tian, Mohammed R. Imran Kazi, Hoda Mehrpouyan, and Daniel Kluver. 2018. Exploring Author Gender in Book Rating and Recommendation. In *Proceedings of the 12th ACM Conference on Recommender Systems* (RecSys '18). ACM, pp. 242–250. DOI:[10.1145/3240323.3240373](https://doi.org/10.1145/3240323.3240373). arXiv:[1808.07586v1](https://arxiv.org/abs/1808.07586v1) [cs.IR].
+
+The data integration is described in more detail in our [extended preprint](https://md.ekstrandom.net/pubs/bag-extended).
+
+**Note:** the limitations section of the paper contains important information about
+the limitations of the data these scripts compile.  **Do not use the gender information
+in this data or tools without understanding those limitations**.  In particular,
+VIAF's gender information is incomplete and, in a number of cases, incorrect.
+
+In addition, several of the data sets integrated by this project come from other sources
+with their own publications.  **If you use any of the rating or interaction data, cite the
+appropriate original source paper.**  For each data set below, we have provided a link to the
+page that describes the data and its appropriate citation.
+
+## License
+
+These tools are under the MIT license:
+
+> Copyright 2019-2020 Boise State University
+>
+> Permission is hereby granted, free of charge, to any person obtaining a copy of
+> this software and associated documentation files (the "Software"), to deal in
+> the Software without restriction, including without limitation the rights to
+> use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+> the Software, and to permit persons to whom the Software is furnished to do so,
+> subject to the following conditions:
+>
+> The above copyright notice and this permission notice shall be included in all
+> copies or substantial portions of the Software.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+> FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+> COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+> IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+> CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 9
- 0
docs/using/index.md

@@ -0,0 +1,9 @@
+---
+title: Importing
+has_children: true
+nav_order: 2
+---
+
+# Using the Tools
+
+This section of the documentation describes how to set up and use the book data integration tools.

+ 23
- 0
docs/using/running.md

@@ -0,0 +1,23 @@
+---
+title: Running
+parent: Importing
+nav_order: 4
+---
+
+# Running the Tools
+
+The data import and integration process is scripted by [DVC](https://dvc.org).  The top-level `Dvcfile` depends on all required steps, so to import the data, just run:
+
+    ./dvc.sh repro
+
+The import process will take approximately 8 hours.
+
+## Custom DVC
+
+Note that the command above uses `./dvc.sh` instead of calling the `dvc` executable directly.  The book
+data tools customize DVC to support checking the status of database import operations, and the `./dvc.sh`
+script runs DVC with the customizations installed.  If you run `dvc`, it will be unable to resolve the
+`pgstat://` URLs and will fail with an error to that effect (the precise error may vary from version to
+version).
+
+`./dvc.sh` is just a wrapper and therefore takes all commands and options applicable to `dvc`.

+ 92
- 0
docs/using/setup.md

@@ -0,0 +1,92 @@
+---
+title: Setup
+parent: Importing
+nav_order: 2
+---
+
+# Setting Up the Environment
+{: .no_toc}
+
+These tools require PostgreSQL and an Anaconda installation.
+
+1. TOC
+{:toc}
+
+## PostgreSQL Database
+
+The book data tools require PostgreSQL (at least version 10), with the following extensions installed:
+
+* [orafce](https://github.com/orafce/orafce)
+* PostgreSQL Contrib (specifically `pg_prewarm` and `uuid-ossp`)
+
+The database will take approximately 500GB.
+
+Create a database for the book data, e.g. `bookdata`, owned by the database user you will be using to run the data integration tools.  The tools will create various tables and schemas.
+
+Once you have created the database, run the following as the database superuser to enable the PostgreSQL extensions:
+
+```sql
+CREATE EXTENSION pg_prewarm;
+CREATE EXTENSION orafce;
+CREATE EXTENSION "uuid-ossp";
+```
+
+## Import Tool Dependencies
+
+The import tools are written in Python and Rust.  The provided `environment.yml` file defines an Anaconda environment (named `bookdata` by default) that contains all required runtimes and libraries:
+
+    conda env create -f environment.yml
+    conda activate bookdata
+
+If you don't want to use Anaconda, see the following for more details on dependencies.
+
+### Python
+
+This needs the following Python dependencies:
+
+- Python 3.6 or later
+- psycopg2
+- numpy
+- tqdm
+- pandas
+- colorama
+- chromalog
+- natural
+- dvc (0.90 or later)
+- sqlparse
+- sqlalchemy
+
+### Rust
+
+The Rust tools need Rust version 1.40 or later.  The easiest way to install this — besides Anaconda — is with
+[rustup](https://www.rust-lang.org/learn/get-started).
+
+The `cargo` build tool will automatically download all Rust libraries required.  The Rust code does not depend on any specific system libraries.
+
+## Database Configuration
+
+All scripts read database configuration from the `DB_URL` environment variable, or alternately
+a config file `db.cfg`.  This file should look like:
+
+```ini
+[DEFAULT]
+host = localhost
+database = bookdata
+```
+
+This file additionally supports branch-specific configuration sections that will apply to work
+on different Git branches, e.g.:
+
+```ini
+[DEFAULT]
+host = localhost
+database = bookdata
+
+[master]
+database = bdorig
+```
+
+This setup will use `bookdata` for most branches, but will connect to `bdorig` when working
+from the `master` branch in the git repository.
+
+This file should **not** be committed to Git.  It is ignored in `.gitignore`.

+ 29
- 0
docs/using/sources.md

@@ -0,0 +1,29 @@
+---
+title: Source Data
+parent: Importing
+nav_order: 3
+---
+
+# Downloading Source Data
+
+These import tools will integrate several data sets. Some of them are auto-downloaded, but others you will
+need to download yourself and save in the `data` directory.  The data sources are:
+
+-   [Library of Congress MDSConnect Open MARC Records](https://www.loc.gov/cds/products/MDSConnect-books_all.html) (auto-downloaded).
+-   [LoC MDSConnect Name Authorities](https://www.loc.gov/cds/products/MDSConnect-name_authorities.html) (auto-downloaded).
+-   [Virtual Internet Authority File](http://viaf.org/viaf/data/) MARC 21 XML data (**not** auto-downloaded).
+-   [OpenLibrary Dump](https://openlibrary.org/developers/dumps) (auto-downloaded).
+-   [Amazon Ratings](http://jmcauley.ucsd.edu/data/amazon/) 'ratings only' data for _Books_ (**not** auto-downloaded — save CSV file in `data`).  **If you use this data, cite the paper on that site.**
+-   [BookCrossing](http://www2.informatik.uni-freiburg.de/~cziegler/BX/) (auto-downloaded). **If you use this data, cite the paper on that site.**
+-   GoodReads data from [UCSD Book Graph](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home) — the GoodReads books, works, authors, and *full interaction* files (**not** auto-downloaded - save GZip'd JSON files in `data`).  **If you use this data, cite the paper on that site.**
+
+If all files are properly downloaded, `./dvc.sh status data/*.dvc` will show that all files are up to date, except for `loc-listings.dvc` which is 'always changed' (it may also display warnings about locked files).
+
+```
+$ ./dvc.sh status data/*.dvc
+data/loc-listings.dvc:
+        always changed
+$
+```
+
+See [Data Model](../data/) for details on how each data source appears in the final data.

+ 1
- 0
index/.gitignore

@@ -8,3 +8,4 @@
 /bx-index.transcript
 /loc-mds-book-info.transcript
 /ol-book-info.transcript
+/gr-book-info.transcript

+ 22
- 0
index/gr-book-info.dvc

@@ -0,0 +1,22 @@
+md5: d4923155bb426d56c156c0248251f296
+cmd: python ../run.py sql-script gr-book-info.sql
+deps:
+- path: gr-book-info.sql
+  md5: 88f4e6e1394ff7c42096f01c2e5adc64
+- path: pgstat://gr-books
+  md5: f5ed1a405492061fa4064874f7ad9b2e
+- path: pgstat://gr-works
+  md5: 6414c0814bddda4e2c9e85ff4dd5f61c
+- path: pgstat://gr-index-books
+  md5: adb828e6f555bfc2b4f2d84c1a3afb0c
+outs:
+- path: pgstat://gr-book-info
+  cache: false
+  md5: f2f75c4bf2dea889664ebb4a285a957a
+  metric: false
+  persist: false
+- path: gr-book-info.transcript
+  cache: true
+  metric: false
+  persist: false
+  md5: d50e2a5923842c4a66e25d2261a97920

+ 61
- 0
index/gr-book-info.sql

@@ -0,0 +1,61 @@
+--- #dep gr-books
+--- #dep gr-works
+--- #dep gr-index-books
+--- #table gr.work_title
+--- #table gr.book_pub_date
+
+--- #step Create useful GR functions
+CREATE OR REPLACE FUNCTION try_date(year VARCHAR, month VARCHAR, day VARCHAR) RETURNS DATE
+IMMUTABLE RETURNS NULL ON NULL INPUT PARALLEL UNSAFE
+    LANGUAGE plpgsql AS $$
+    BEGIN
+        RETURN MAKE_DATE(NULLIF(year, '')::INTEGER,
+                    NULLIF(month, '')::INTEGER,
+                    NULLIF(day, '')::INTEGER);
+    EXCEPTION WHEN SQLSTATE '22008' THEN
+        RETURN NULL;
+    END;
+    $$;
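+
+-- Illustrative note (not part of the import): invalid component combinations yield NULL
+-- rather than an error; e.g. try_date('2012', '2', '30') returns NULL because Feb 30
+-- raises SQLSTATE 22008, which the handler above traps.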
+
+--- #step Extract GoodReads work titles
+DROP MATERIALIZED VIEW IF EXISTS gr.work_title;
+CREATE MATERIALIZED VIEW gr.work_title
+AS SELECT gr_work_rid, (gr_work_data->>'work_id')::int AS gr_work_id,
+  NULLIF(gr_work_data->>'original_title', '') AS work_title
+FROM gr.raw_work;
+CREATE INDEX gr_work_title_work_idx ON gr.work_title (gr_work_id);
+ANALYZE gr.work_title;
+
+--- #step Extract GoodReads book publication dates
+DROP MATERIALIZED VIEW IF EXISTS gr.book_pub_date;
+CREATE MATERIALIZED VIEW gr.book_pub_date
+AS SELECT gr_book_rid, book_id AS gr_book_id,
+          NULLIF(publication_year, '')::INTEGER AS pub_year,
+          NULLIF(publication_month, '')::INTEGER AS pub_month,
+          NULLIF(publication_day, '')::INTEGER AS pub_day,
+          try_date(publication_year, publication_month, publication_day) AS pub_date
+   FROM gr.raw_book,
+        jsonb_to_record(gr_book_data) AS
+            x(book_id INTEGER, publication_year VARCHAR,
+              publication_month VARCHAR, publication_day VARCHAR)
+   WHERE NULLIF(publication_year, '') IS NOT NULL;
+CREATE UNIQUE INDEX gr_bpd_rec_idx ON gr.book_pub_date (gr_book_rid);
+CREATE UNIQUE INDEX gr_bpd_book_idx ON gr.book_pub_date (gr_book_id);
+ANALYZE gr.book_pub_date;
+
+--- #step Extract GoodReads work original publication dates
+DROP MATERIALIZED VIEW IF EXISTS gr.work_pub_date;
+CREATE MATERIALIZED VIEW gr.work_pub_date
+AS SELECT gr_work_rid, work_id AS gr_work_id,
+          NULLIF(original_publication_year, '')::INTEGER AS pub_year,
+          NULLIF(original_publication_month, '')::INTEGER AS pub_month,
+          NULLIF(original_publication_day, '')::INTEGER AS pub_day,
+          try_date(original_publication_year, original_publication_month, original_publication_day) AS pub_date
+   FROM gr.raw_work,
+        jsonb_to_record(gr_work_data) AS
+            x(work_id INTEGER, original_publication_year VARCHAR,
+              original_publication_month VARCHAR, original_publication_day VARCHAR)
+   WHERE NULLIF(original_publication_year, '') IS NOT NULL;
+CREATE UNIQUE INDEX gr_wpd_rec_idx ON gr.work_pub_date (gr_work_rid);
+CREATE UNIQUE INDEX gr_wpd_work_idx ON gr.work_pub_date (gr_work_id);
+ANALYZE gr.work_pub_date;

+ 2
- 2
index/gr-index-books.dvc

@@ -1,8 +1,8 @@
-md5: eb1aab0498209f394a0f5f0bceffd647
+md5: 0ec39924db98089bc27b49064b28242c
 cmd: python ../run.py sql-script gr-index-books.sql
 deps:
 - path: gr-index-books.sql
-  md5: e935200eeaa893d2112427c06e89ca6e
+  md5: bf2560c7da69f6fc8d124a98f4f5a0ed
 - path: pgstat://gr-books
   md5: f5ed1a405492061fa4064874f7ad9b2e
 - path: pgstat://gr-works

+ 0
- 10
index/gr-index-books.sql

@@ -1,10 +1,8 @@
 --- #dep gr-books
 --- #dep gr-works
 --- #dep gr-authors
---- #dep gr-book-genres
 --- #table gr.work_ids
 --- #table gr.book_ids
---- #table gr.book_genres
 --- #step Add book PK
 --- #allow invalid_table_definition
 ALTER TABLE gr.raw_book ADD CONSTRAINT gr_raw_book_pk PRIMARY KEY (gr_book_rid);
@@ -94,11 +92,3 @@ CREATE TABLE IF NOT EXISTS gr.book_genres
      WHERE gr_book_id = (gr_book_genres_data->>'book_id')::int;
 CREATE INDEX bg_book_rid ON gr.book_genres (gr_book_rid);
 CREATE INDEX bg_book_id ON gr.book_genres (gr_book_id);
-
---- #step Extract GoodReads book titles
-DROP MATERIALIZED VIEW IF EXISTS gr.work_titles;
-CREATE MATERIALIZED VIEW gr.work_titles
-AS SELECT gr_work_rid, (gr_work_data->>'work_id')::int AS gr_work_id,
-  NULLIF(gr_work_data->>'original_title', '') AS work_title
-FROM gr.raw_work;
-CREATE INDEX gr_work_title_work_idx ON gr.work_titles (gr_work_id);

+ 2
- 2
index/loc-mds-book-info.dvc

@@ -1,8 +1,8 @@
-md5: 40c5f68ff5c321936acf34e518bcca8f
+md5: e97d32214a51bdad4c23f7f678ba0c32
 cmd: python ../run.py sql-script loc-mds-book-info.sql
 deps:
 - path: loc-mds-book-info.sql
-  md5: fce49f005e1450890452881a6e71ff40
+  md5: c13381fbb505c7e778032685452c1f23
 - path: pgstat://loc-mds-index-books
   md5: d638b9a4a77a99749f2214e6400d85a1
 outs:

+ 21
- 0
index/loc-mds-book-info.sql

@@ -1,6 +1,27 @@
 --- #dep loc-mds-index-books
 -- Extract more book information
 
+--- #step Extract authors
+CREATE MATERIALIZED VIEW IF NOT EXISTS locmds.book_author_name
+  AS SELECT rec_id, regexp_replace(contents, '\W+$', '') AS name
+  FROM locmds.book_marc_field
+  WHERE tag = '100' AND sf_code = 'a'
+  WITH NO DATA;
+REFRESH MATERIALIZED VIEW locmds.book_author_name;
+CREATE INDEX IF NOT EXISTS book_author_name_rec_idx ON locmds.book_author_name (rec_id);
+CREATE INDEX IF NOT EXISTS book_author_name_name_idx ON locmds.book_author_name (name);
+ANALYZE locmds.book_author_name;
+
+--- #step Extract publication years
+CREATE MATERIALIZED VIEW IF NOT EXISTS locmds.book_pub_year
+  AS SELECT rec_id, substring(contents from '(\d\d\d\d)') AS pub_year
+  FROM locmds.book_marc_field
+  WHERE tag = '260' AND sf_code = 'c' AND substring(contents from '(\d\d\d\d)') IS NOT NULL
+  WITH NO DATA;
+REFRESH MATERIALIZED VIEW locmds.book_pub_year;
+CREATE INDEX IF NOT EXISTS book_pub_year_rec_idx ON locmds.book_pub_year (rec_id);
+ANALYZE locmds.book_pub_year;
+
 --- #step Extract book titles
 DROP MATERIALIZED VIEW IF EXISTS locmds.book_title;
 CREATE MATERIALIZED VIEW locmds.book_title

+ 2
- 2
index/loc-mds-index-books.dvc

@@ -1,8 +1,8 @@
-md5: 102c2a7c093c19fd709bb86f22b67005
+md5: aebb364d363e41f09f68258b2d98b75b
 cmd: python ../run.py sql-script loc-mds-index-books.sql
 deps:
 - path: loc-mds-index-books.sql
-  md5: 33201b025015702a146d759b6785b633
+  md5: 4388f828b7802baf1cc17d0849babd72
 - path: pgstat://loc-mds-books
   md5: 0de2d856ffd1f5d08b2a2fa55b8c9098
 outs:

+ 0
- 21
index/loc-mds-index-books.sql

@@ -100,24 +100,3 @@ REFRESH MATERIALIZED VIEW locmds.book_rec_isbn;
 CREATE INDEX IF NOT EXISTS book_rec_isbn_rec_idx ON locmds.book_rec_isbn (rec_id);
 CREATE INDEX IF NOT EXISTS book_rec_isbn_isbn_idx ON locmds.book_rec_isbn (isbn_id);
 ANALYZE locmds.book_rec_isbn;
-
---- #step Extract authors
-CREATE MATERIALIZED VIEW IF NOT EXISTS locmds.book_author_name
-  AS SELECT rec_id, regexp_replace(contents, '\W+$', '') AS name
-  FROM locmds.book_marc_field
-  WHERE tag = '100' AND sf_code = 'a'
-  WITH NO DATA;
-REFRESH MATERIALIZED VIEW locmds.book_author_name;
-CREATE INDEX IF NOT EXISTS book_author_name_rec_idx ON locmds.book_author_name (rec_id);
-CREATE INDEX IF NOT EXISTS book_author_name_name_idx ON locmds.book_author_name (name);
-ANALYZE locmds.book_author_name;
-
---= #step Extract publication years
-CREATE MATERIALIZED VIEW IF NOT EXISTS locmds.book_pub_year
-  AS SELECT rec_id, substring(contents from '(\d\d\d\d)') AS pub_year
-  FROM locmds.book_marc_field
-  WHERE tag = '260' AND sf_code = 'c' AND substring(contents from '(\d\d\d\d)') IS NOT NULL
-  WITH NO DATA;
-REFRESH MATERIALIZED VIEW locmds.book_pub_year;
-CREATE INDEX IF NOT EXISTS book_pub_year_rec_idx ON locmds.book_pub_year (rec_id);
-ANALYZE locmds.book_pub_year;

+ 2
- 2
index/ol-index.dvc

@@ -1,7 +1,7 @@
-md5: 56e276c0040a89e653f7841bd462232a
+md5: 995e35e089c368027528ad202fe2f7a1
 cmd: python ../run.py sql-script ol-index.sql
 deps:
-- md5: ede73df32009dbbdaa25c6a7bf722491
+- md5: 053859a2a15f4d01787476bf784134f9
   path: ol-index.sql
 - path: pgstat://ol-authors
   md5: 82a67602a52fe74c29209ceb25e35686

+ 0
- 17
index/ol-index.sql

@@ -101,23 +101,6 @@ ALTER TABLE ol.edition_work ADD CONSTRAINT edition_work_ed_fk FOREIGN KEY (editi
 --- #allow duplicate_object
 ALTER TABLE ol.edition_work ADD CONSTRAINT edition_work_wk_fk FOREIGN KEY (work_id) REFERENCES ol.work;
 
---- #step Extract ISBNs and ASINs
-CREATE MATERIALIZED VIEW IF NOT EXISTS ol.edition_isbn10
-  AS SELECT edition_id, jsonb_array_elements_text(edition_data->'isbn_10') AS isbn
-     FROM ol.edition;
-CREATE MATERIALIZED VIEW IF NOT EXISTS ol.edition_isbn13
-  AS SELECT edition_id, jsonb_array_elements_text(edition_data->'isbn_13') AS isbn
-     FROM ol.edition;
-CREATE MATERIALIZED VIEW IF NOT EXISTS ol.edition_asin
-  AS SELECT edition_id, jsonb_array_elements_text(edition_data#>'{identifiers,amazon}') AS asin
-     FROM ol.edition;
-CREATE MATERIALIZED VIEW IF NOT EXISTS ol.edition_lccn
-  AS SELECT edition_id, jsonb_array_elements_text(edition_data->'lccn') AS lccn
-     FROM ol.edition;
-CREATE MATERIALIZED VIEW IF NOT EXISTS ol.edition_gr_bid
-  AS SELECT edition_id, jsonb_array_elements_text(edition_data#>'{identifiers,goodreads}') AS gr_book_rid
-     FROM ol.edition;
-
 --- #step Integrate ISBN/ASIN identifiers
 DROP TABLE IF EXISTS ol.edition_isbn CASCADE;
 CREATE TABLE ol.edition_isbn (

+ 3
- 1
integrate/author-info.dvc

@@ -1,8 +1,10 @@
-md5: 1480e59d2b372599bc362d94e7a66a2f
+md5: 84aff4ccd0c4b8ca29f83e54c28ebc52
 cmd: python ../run.py sql-script author-info.sql
 deps:
 - path: pgstat://loc-mds-index-books
   md5: d638b9a4a77a99749f2214e6400d85a1
+- path: pgstat://loc-mds-book-info
+  md5: be165ab9a7023147ad3f91248839562f
 - path: pgstat://viaf-index
   md5: 93814ea8630c4e4e9a3ce388a990c2c8
 - path: pgstat://cluster