Browse Source

Add documentation

Michael Ekstrand 8 months ago
parent
commit
d07e9758d5
10 changed files with 471 additions and 1 deletions
  1. 1
    1
      README.md
  2. 1
    0
      docs/.gitignore
  3. 6
    0
      docs/Gemfile
  4. 253
    0
      docs/Gemfile.lock
  5. 8
    0
      docs/_config.yml
  6. 50
    0
      docs/index.md
  7. 8
    0
      docs/using/index.md
  8. 23
    0
      docs/using/running.md
  9. 92
    0
      docs/using/setup.md
  10. 29
    0
      docs/using/sources.md

+ 1
- 1
README.md

@@ -2,7 +2,7 @@ This repository contains the code to import and integrate the book and rating da
 It imports and integrates data from several sources in a single PostgreSQL database; import scripts
 are primarily in Python, with Rust code for high-throughput processing of raw data files.
 
-If you use these scripts in any published reseaerch, cite [our paper](https://md.ekstrandom.net/pubs/book-author-gender):
+If you use these scripts in any published research, cite [our paper](https://md.ekstrandom.net/pubs/book-author-gender):
 
 > Michael D. Ekstrand, Mucun Tian, Mohammed R. Imran Kazi, Hoda Mehrpouyan, and Daniel Kluver. 2018. Exploring Author Gender in Book Rating and Recommendation. In *Proceedings of the 12th ACM Conference on Recommender Systems* (RecSys '18). ACM, pp. 242–250. DOI:[10.1145/3240323.3240373](https://doi.org/10.1145/3240323.3240373). arXiv:[1808.07586v1](https://arxiv.org/abs/1808.07586v1) [cs.IR].
 

+ 1
- 0
docs/.gitignore

@@ -0,0 +1 @@
+/_site

+ 6
- 0
docs/Gemfile

@@ -0,0 +1,6 @@
+source "https://rubygems.org"
+
+gem "github-pages", group: :jekyll_plugins
+gem "jekyll-include-cache", group: :jekyll_plugins
+
+gem 'wdm', '>= 0.1.0' if Gem.win_platform?

+ 253
- 0
docs/Gemfile.lock

@@ -0,0 +1,253 @@
+GEM
+  remote: https://rubygems.org/
+  specs:
+    activesupport (6.0.2.2)
+      concurrent-ruby (~> 1.0, >= 1.0.2)
+      i18n (>= 0.7, < 2)
+      minitest (~> 5.1)
+      tzinfo (~> 1.1)
+      zeitwerk (~> 2.2)
+    addressable (2.7.0)
+      public_suffix (>= 2.0.2, < 5.0)
+    coffee-script (2.4.1)
+      coffee-script-source
+      execjs
+    coffee-script-source (1.11.1)
+    colorator (1.1.0)
+    commonmarker (0.17.13)
+      ruby-enum (~> 0.5)
+    concurrent-ruby (1.1.6)
+    dnsruby (1.61.3)
+      addressable (~> 2.5)
+    em-websocket (0.5.1)
+      eventmachine (>= 0.12.9)
+      http_parser.rb (~> 0.6.0)
+    ethon (0.12.0)
+      ffi (>= 1.3.0)
+    eventmachine (1.2.7-x64-mingw32)
+    execjs (2.7.0)
+    faraday (1.0.0)
+      multipart-post (>= 1.2, < 3)
+    ffi (1.12.2-x64-mingw32)
+    forwardable-extended (2.6.0)
+    gemoji (3.0.1)
+    github-pages (204)
+      github-pages-health-check (= 1.16.1)
+      jekyll (= 3.8.5)
+      jekyll-avatar (= 0.7.0)
+      jekyll-coffeescript (= 1.1.1)
+      jekyll-commonmark-ghpages (= 0.1.6)
+      jekyll-default-layout (= 0.1.4)
+      jekyll-feed (= 0.13.0)
+      jekyll-gist (= 1.5.0)
+      jekyll-github-metadata (= 2.13.0)
+      jekyll-mentions (= 1.5.1)
+      jekyll-optional-front-matter (= 0.3.2)
+      jekyll-paginate (= 1.1.0)
+      jekyll-readme-index (= 0.3.0)
+      jekyll-redirect-from (= 0.15.0)
+      jekyll-relative-links (= 0.6.1)
+      jekyll-remote-theme (= 0.4.1)
+      jekyll-sass-converter (= 1.5.2)
+      jekyll-seo-tag (= 2.6.1)
+      jekyll-sitemap (= 1.4.0)
+      jekyll-swiss (= 1.0.0)
+      jekyll-theme-architect (= 0.1.1)
+      jekyll-theme-cayman (= 0.1.1)
+      jekyll-theme-dinky (= 0.1.1)
+      jekyll-theme-hacker (= 0.1.1)
+      jekyll-theme-leap-day (= 0.1.1)
+      jekyll-theme-merlot (= 0.1.1)
+      jekyll-theme-midnight (= 0.1.1)
+      jekyll-theme-minimal (= 0.1.1)
+      jekyll-theme-modernist (= 0.1.1)
+      jekyll-theme-primer (= 0.5.4)
+      jekyll-theme-slate (= 0.1.1)
+      jekyll-theme-tactile (= 0.1.1)
+      jekyll-theme-time-machine (= 0.1.1)
+      jekyll-titles-from-headings (= 0.5.3)
+      jemoji (= 0.11.1)
+      kramdown (= 1.17.0)
+      liquid (= 4.0.3)
+      mercenary (~> 0.3)
+      minima (= 2.5.1)
+      nokogiri (>= 1.10.4, < 2.0)
+      rouge (= 3.13.0)
+      terminal-table (~> 1.4)
+    github-pages-health-check (1.16.1)
+      addressable (~> 2.3)
+      dnsruby (~> 1.60)
+      octokit (~> 4.0)
+      public_suffix (~> 3.0)
+      typhoeus (~> 1.3)
+    html-pipeline (2.12.3)
+      activesupport (>= 2)
+      nokogiri (>= 1.4)
+    http_parser.rb (0.6.0)
+    i18n (0.9.5)
+      concurrent-ruby (~> 1.0)
+    jekyll (3.8.5)
+      addressable (~> 2.4)
+      colorator (~> 1.0)
+      em-websocket (~> 0.5)
+      i18n (~> 0.7)
+      jekyll-sass-converter (~> 1.0)
+      jekyll-watch (~> 2.0)
+      kramdown (~> 1.14)
+      liquid (~> 4.0)
+      mercenary (~> 0.3.3)
+      pathutil (~> 0.9)
+      rouge (>= 1.7, < 4)
+      safe_yaml (~> 1.0)
+    jekyll-avatar (0.7.0)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-coffeescript (1.1.1)
+      coffee-script (~> 2.2)
+      coffee-script-source (~> 1.11.1)
+    jekyll-commonmark (1.3.1)
+      commonmarker (~> 0.14)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-commonmark-ghpages (0.1.6)
+      commonmarker (~> 0.17.6)
+      jekyll-commonmark (~> 1.2)
+      rouge (>= 2.0, < 4.0)
+    jekyll-default-layout (0.1.4)
+      jekyll (~> 3.0)
+    jekyll-feed (0.13.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-gist (1.5.0)
+      octokit (~> 4.2)
+    jekyll-github-metadata (2.13.0)
+      jekyll (>= 3.4, < 5.0)
+      octokit (~> 4.0, != 4.4.0)
+    jekyll-include-cache (0.2.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-mentions (1.5.1)
+      html-pipeline (~> 2.3)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-optional-front-matter (0.3.2)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-paginate (1.1.0)
+    jekyll-readme-index (0.3.0)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-redirect-from (0.15.0)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-relative-links (0.6.1)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-remote-theme (0.4.1)
+      addressable (~> 2.0)
+      jekyll (>= 3.5, < 5.0)
+      rubyzip (>= 1.3.0)
+    jekyll-sass-converter (1.5.2)
+      sass (~> 3.4)
+    jekyll-seo-tag (2.6.1)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-sitemap (1.4.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-swiss (1.0.0)
+    jekyll-theme-architect (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-cayman (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-dinky (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-hacker (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-leap-day (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-merlot (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-midnight (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-minimal (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-modernist (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-primer (0.5.4)
+      jekyll (> 3.5, < 5.0)
+      jekyll-github-metadata (~> 2.9)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-slate (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-tactile (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-theme-time-machine (0.1.1)
+      jekyll (~> 3.5)
+      jekyll-seo-tag (~> 2.0)
+    jekyll-titles-from-headings (0.5.3)
+      jekyll (>= 3.3, < 5.0)
+    jekyll-watch (2.2.1)
+      listen (~> 3.0)
+    jemoji (0.11.1)
+      gemoji (~> 3.0)
+      html-pipeline (~> 2.2)
+      jekyll (>= 3.0, < 5.0)
+    kramdown (1.17.0)
+    liquid (4.0.3)
+    listen (3.2.1)
+      rb-fsevent (~> 0.10, >= 0.10.3)
+      rb-inotify (~> 0.9, >= 0.9.10)
+    mercenary (0.3.6)
+    mini_portile2 (2.4.0)
+    minima (2.5.1)
+      jekyll (>= 3.5, < 5.0)
+      jekyll-feed (~> 0.9)
+      jekyll-seo-tag (~> 2.1)
+    minitest (5.14.0)
+    multipart-post (2.1.1)
+    nokogiri (1.10.9-x64-mingw32)
+      mini_portile2 (~> 2.4.0)
+    octokit (4.18.0)
+      faraday (>= 0.9)
+      sawyer (~> 0.8.0, >= 0.5.3)
+    pathutil (0.16.2)
+      forwardable-extended (~> 2.6)
+    public_suffix (3.1.1)
+    rb-fsevent (0.10.3)
+    rb-inotify (0.10.1)
+      ffi (~> 1.0)
+    rouge (3.13.0)
+    ruby-enum (0.7.2)
+      i18n
+    rubyzip (2.3.0)
+    safe_yaml (1.0.5)
+    sass (3.7.4)
+      sass-listen (~> 4.0.0)
+    sass-listen (4.0.0)
+      rb-fsevent (~> 0.9, >= 0.9.4)
+      rb-inotify (~> 0.9, >= 0.9.7)
+    sawyer (0.8.2)
+      addressable (>= 2.3.5)
+      faraday (> 0.8, < 2.0)
+    terminal-table (1.8.0)
+      unicode-display_width (~> 1.1, >= 1.1.1)
+    thread_safe (0.3.6)
+    typhoeus (1.3.1)
+      ethon (>= 0.9.0)
+    tzinfo (1.2.6)
+      thread_safe (~> 0.1)
+    unicode-display_width (1.7.0)
+    wdm (0.1.1)
+    zeitwerk (2.3.0)
+
+PLATFORMS
+  x64-mingw32
+
+DEPENDENCIES
+  github-pages
+  jekyll-include-cache
+  wdm (>= 0.1.0)
+
+BUNDLED WITH
+   1.17.2

+ 8
- 0
docs/_config.yml

@@ -0,0 +1,8 @@
+title: Book Data Tools
+name: Michael Ekstrand
+
+remote_theme: pmarsceill/just-the-docs
+
+aux_links:
+  "GitHub": https://github.com/BoiseState/bookdata-tools
+  "PIReT": https://piret.info

+ 50
- 0
docs/index.md

@@ -0,0 +1,50 @@
+---
+title: Overview
+---
+
+# Book Data Tools
+
+The PIReT Book Data Tools are a set of tools for ingesting, integrating, and indexing
+a variety of sources of book data, created by the [People and Information Research Team](https://piret.info)
+at [Boise State University](https://boisestate.edu).  The result of running these tools is a PostgreSQL
+database with the raw data, various useful extracted features, and integrated identifiers across the various
+data sources for cross-linking.
+
+If you use these scripts in any published research, cite [our paper](https://md.ekstrandom.net/pubs/book-author-gender):
+
+> Michael D. Ekstrand, Mucun Tian, Mohammed R. Imran Kazi, Hoda Mehrpouyan, and Daniel Kluver. 2018. Exploring Author Gender in Book Rating and Recommendation. In *Proceedings of the 12th ACM Conference on Recommender Systems* (RecSys '18). ACM, pp. 242–250. DOI:[10.1145/3240323.3240373](https://doi.org/10.1145/3240323.3240373). arXiv:[1808.07586v1](https://arxiv.org/abs/1808.07586v1) [cs.IR].
+
+The data integration is described in more detail in our [extended preprint](https://md.ekstrandom.net/pubs/bag-extended).
+
+**Note:** the limitations section of the paper contains important information about
+the limitations of the data these scripts compile.  **Do not use the gender information
+in this data or tools without understanding those limitations**.  In particular,
+VIAF's gender information is incomplete and, in a number of cases, incorrect.
+
+In addition, several of the data sets integrated by this project come from other sources
+with their own publications.  **If you use any of the rating or interaction data, cite the
+appropriate original source paper.**  For each data set below, we have provided a link to the
+page that describes the data and its appropriate citation.
+
+## License
+
+These tools are under the MIT license:
+
+> Copyright 2019-2020 Boise State University
+>
+> Permission is hereby granted, free of charge, to any person obtaining a copy of
+> this software and associated documentation files (the "Software"), to deal in
+> the Software without restriction, including without limitation the rights to
+> use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+> the Software, and to permit persons to whom the Software is furnished to do so,
+> subject to the following conditions:
+>
+> The above copyright notice and this permission notice shall be included in all
+> copies or substantial portions of the Software.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+> FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+> COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+> IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+> CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

+ 8
- 0
docs/using/index.md

@@ -0,0 +1,8 @@
+---
+title: Usage
+has_children: true
+---
+
+# Using the Tools
+
+This section of the documentation describes how to set up and use the book data integration tools.

+ 23
- 0
docs/using/running.md

@@ -0,0 +1,23 @@
+---
+title: Running
+parent: Usage
+nav_order: 4
+---
+
+# Running the Tools
+
+The data import and integration process is scripted by [DVC](https://dvc.org).  The top-level `Dvcfile` depends on all required steps, so to import the data, just run:
+
+    ./dvc.sh repro
+
+The import process will take approximately 8 hours.
+
+## Custom DVC
+
+Note that the command above uses `./dvc.sh` instead of calling the `dvc` executable directly.  The book
+data tools customize DVC to support checking the status of database import operations, and the `./dvc.sh`
+script runs DVC with the customizations installed.  If you run `dvc`, it will be unable to resolve the
+`pgstat://` URLs and will fail with an error to that effect (the precise error may vary from version to
+version).
+
+`./dvc.sh` is just a wrapper and therefore takes all commands and options applicable to `dvc`.

+ 92
- 0
docs/using/setup.md

@@ -0,0 +1,92 @@
+---
+title: Setup
+parent: Usage
+nav_order: 2
+---
+
+# Setting Up the Environment
+{: .no_toc}
+
+These tools require PostgreSQL and an Anaconda installation.
+
+1. TOC
+{:toc}
+
+## PostgreSQL Database
+
+The book data tools require PostgreSQL (at least version 10), with the following extensions installed:
+
+* [orafce](https://github.com/orafce/orafce)
+* PostgreSQL Contrib (specifically `pg_prewarm` and `uuid-ossp`)
+
+The database will take approximately 500GB.
+
+Create a database for the book data, e.g. `bookdata`, owned by the database user you will be using to run the data integration tools.  The tools will create various tables and schemas.
+
+Once you have created the database, run the following as the database superuser to enable the PostgreSQL extensions:
+
+```sql
+CREATE EXTENSION pg_prewarm;
+CREATE EXTENSION orafce;
+CREATE EXTENSION "uuid-ossp";
+```
+
+## Import Tool Dependencies
+
+The import tools are written in Python and Rust.  The provided `environment.yml` file defines an Anaconda environment (named `bookdata` by default) that contains all required runtimes and libraries:
+
+    conda env create -f environment.yml
+    conda activate bookdata
+
+If you don't want to use Anaconda, see the following for more details on dependencies.
+
+### Python
+
+This needs the following Python dependencies:
+
+- Python 3.6 or later
+- psycopg2
+- numpy
+- tqdm
+- pandas
+- colorama
+- chromalog
+- natural
+- dvc (0.90 or later)
+- sqlparse
+- sqlalchemy
+
+### Rust
+
+The Rust tools need Rust version 1.40 or later.  The easiest way to install this — besides Anaconda — is with
+[rustup](https://www.rust-lang.org/learn/get-started).
+
+The `cargo` build tool will automatically download all Rust libraries required.  The Rust code does not depend on any specific system libraries.
+
+## Database Configuration
+
+All scripts read database configuration from the `DB_URL` environment variable, or alternately
+a config file `db.cfg`.  This file should look like:
+
+```ini
+[DEFAULT]
+host = localhost
+database = bookdata
+```
+
+This file additionally supports branch-specific configuration sections that will apply to work
+on different Git branches, e.g.:
+
+```ini
+[DEFAULT]
+host = localhost
+database = bookdata
+
+[master]
+database = bdorig
+```
+
+This setup will use `bookdata` for most branches, but will connect to `bdorig` when working
+from the `master` branch in the git repository.
+
+This file should **not** be committed to Git.  It is ignored in `.gitignore`.

+ 29
- 0
docs/using/sources.md

@@ -0,0 +1,29 @@
+---
+title: Source Data
+parent: Usage
+nav_order: 3
+---
+
+# Downloading Source Data
+
+These import tools will integrate several data sets. Some of them are auto-downloaded, but others you will
+need to download yourself and save in the `data` directory.  The data sources are:
+
+-   [Library of Congress MDSConnect Open MARC Records](https://www.loc.gov/cds/products/MDSConnect-books_all.html) (auto-downloaded).
+-   [LoC MDSConnect Name Authorities](https://www.loc.gov/cds/products/MDSConnect-name_authorities.html) (auto-downloaded).
+-   [Virtual International Authority File](http://viaf.org/viaf/data/) MARC 21 XML data (**not** auto-downloaded).
+-   [OpenLibrary Dump](https://openlibrary.org/developers/dumps) (auto-downloaded).
+-   [Amazon Ratings](http://jmcauley.ucsd.edu/data/amazon/) 'ratings only' data for _Books_ (**not** auto-downloaded — save CSV file in `data`).  **If you use this data, cite the paper on that site.**
+-   [BookCrossing](http://www2.informatik.uni-freiburg.de/~cziegler/BX/) (auto-downloaded). **If you use this data, cite the paper on that site.**
+-   GoodReads data from [UCSD Book Graph](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home) — the GoodReads books, works, authors, and *full interaction* files (**not** auto-downloaded — save GZip'd JSON files in `data`).  **If you use this data, cite the paper on that site.**
+
+If all files are properly downloaded, `./dvc.sh status data/*.dvc` will show that all files are up to date, except for `loc-listings.dvc` which is 'always changed' (it may also display warnings about locked files).
+
+```
+$ ./dvc.sh status data/*.dvc
+data/loc-listings.dvc:
+        always changed
+$
+```
+
+See [Data Model](../data/) for details on how each data source appears in the final data.