Browse Source

Re-run LOC MDS extract

Michael Ekstrand 8 months ago
parent
commit
41d4bc233c

+ 1
- 0
.gitignore

@@ -19,3 +19,4 @@ db.cfg
 *.gml
 *.graphml
 *.txt
+/loc-mds-extract-isbns.transcript

+ 3
- 3
import/loc-mds-books.dvc

@@ -1,4 +1,4 @@
-md5: badee6a37e5ed669c4b08aa57e511bc9
+md5: ed37f6ac8cd504e6a01b1b7f72de9840
 cmd: python run.py --rust parse-marc --db-schema locmds -t book_marc_field --truncate
   --stage loc-mds-books -D loc-mds-schema --transcript import/loc-mds-books.transcript
   --src-dir data/loc-books --src-prefix BooksAll.2016
@@ -6,12 +6,12 @@ wdir: ..
 deps:
 - md5: 1b1e7ab1d98cc81e373dfc53345d4bb7.dir
   path: data/loc-books
-- md5: aec051062654d5f98dc0ccb72a0ababa
+- md5: e63399b7692987ecd6b579066e5bd35e
   path: pgstat://loc-mds-schema
 outs:
 - path: pgstat://loc-mds-books
   cache: false
-  md5: 0de2d856ffd1f5d08b2a2fa55b8c9098
+  md5: f6e0026b4d4fe4bac7056c7fe0491259
   metric: false
   persist: false
 - md5: 19b0a011c9053361278a3d812518b229

+ 2
- 2
index/loc-mds-book-info.dvc

@@ -1,8 +1,8 @@
-md5: e97d32214a51bdad4c23f7f678ba0c32
+md5: 40c5f68ff5c321936acf34e518bcca8f
 cmd: python ../run.py sql-script loc-mds-book-info.sql
 deps:
 - path: loc-mds-book-info.sql
-  md5: c13381fbb505c7e778032685452c1f23
+  md5: fce49f005e1450890452881a6e71ff40
 - path: pgstat://loc-mds-index-books
   md5: d638b9a4a77a99749f2214e6400d85a1
 outs:

+ 18
- 0
index/loc-mds-extract-isbns.dvc

@@ -0,0 +1,18 @@
+cmd: python run.py --rust parse-isbns --src-table locmds.book_raw_isbn --out-table
+  locmds.book_extracted_isbn --stage loc-mds-extract-isbns -D loc-mds-books -T loc-mds-extract-isbns.transcript
+wdir: ..
+deps:
+- path: pgstat://loc-mds-books
+  md5: f6e0026b4d4fe4bac7056c7fe0491259
+outs:
+- path: pgstat://loc-mds-extract-isbns
+  cache: false
+  md5: c4ceff988a5b8a7c15ca00c0dbc4ec59
+  metric: false
+  persist: false
+- path: loc-mds-extract-isbns.transcript
+  md5: e08255fdca28f536f2aa8f2d065a26cf
+  cache: true
+  metric: false
+  persist: false
+md5: c802e190e0783e771f514ddb7aed5f12

+ 7
- 5
index/loc-mds-index-books.dvc

@@ -1,18 +1,20 @@
-md5: aebb364d363e41f09f68258b2d98b75b
+md5: 05b74972708103f90ede481ffc1827cf
 cmd: python ../run.py sql-script loc-mds-index-books.sql
 deps:
 - path: loc-mds-index-books.sql
-  md5: 4388f828b7802baf1cc17d0849babd72
+  md5: cdd4dabcd4f7b9b9f4eca3b83f1eb676
 - path: pgstat://loc-mds-books
-  md5: 0de2d856ffd1f5d08b2a2fa55b8c9098
+  md5: f6e0026b4d4fe4bac7056c7fe0491259
+- path: pgstat://loc-mds-extract-isbns
+  md5: c4ceff988a5b8a7c15ca00c0dbc4ec59
 outs:
 - path: pgstat://loc-mds-index-books
   cache: false
-  md5: d638b9a4a77a99749f2214e6400d85a1
+  md5: abdf7eecd1861c7318b15a2b32435204
   metric: false
   persist: false
 - path: loc-mds-index-books.transcript
   cache: true
   metric: false
   persist: false
-  md5: db797289a0adc713c3e2dbe14df73ffd
+  md5: 0af5f4abe6b224d06ac8a2950d22ee61

+ 6
- 16
index/loc-mds-index-books.sql

@@ -1,4 +1,5 @@
 --- #dep loc-mds-books
+--- #dep loc-mds-extract-isbns
 --- #table locmds.book_marc_cn
 --- #table locmds.book_record_info
 --- #table locmds.book
@@ -77,28 +78,17 @@ CREATE INDEX IF NOT EXISTS book_control_idx ON locmds.book (marc_cn);
 CREATE INDEX IF NOT EXISTS book_lccn_idx ON locmds.book (lccn);
 ANALYZE locmds.book;
 
---- #step Index ISBNs
-DROP MATERIALIZED VIEW IF EXISTS locmds.book_rec_isbn;
-DROP MATERIALIZED VIEW IF EXISTS locmds.book_extracted_isbn;
-DROP MATERIALIZED VIEW IF EXISTS locmds.book_raw_isbn;
-CREATE MATERIALIZED VIEW locmds.book_raw_isbn
-AS SELECT rec_id, contents AS isbn_text
-   FROM locmds.book_marc_field
-   WHERE tag = '020' AND sf_code = 'a';
-
-CREATE MATERIALIZED VIEW locmds.book_extracted_isbn AS
-  SELECT rec_id, regexp_replace(m[1], '[- ]', '', 'g') AS isbn, trim(m[2]) AS descr
-  FROM locmds.book_raw_isbn,
-    regexp_matches(trim(isbn_text), '(?:^|ISBN\s+)(?:[a-z]\s+|\(\d+\)\s+|\*)?([0-9 -]+[Xx]?)(?:\s*\((.+?)\))?', 'g') AS m;
-
+--- #step Index and link ISBNs
 INSERT INTO isbn_id (isbn)
-  WITH isbns AS (SELECT DISTINCT isbn FROM locmds.book_extracted_isbn WHERE isbn IS NOT NULL AND char_length(isbn) IN (10,13))
+  WITH isbns AS (SELECT DISTINCT isbn
+                 FROM locmds.book_extracted_isbn
+                 WHERE isbn IS NOT NULL)
   SELECT isbn FROM isbns
   WHERE isbn NOT IN (SELECT isbn FROM isbn_id);
 ANALYZE isbn_id;
 
 CREATE MATERIALIZED VIEW locmds.book_rec_isbn
-  AS SELECT rec_id, isbn_id
+  AS SELECT DISTINCT rec_id, isbn_id
      FROM locmds.book JOIN locmds.book_extracted_isbn USING (rec_id) JOIN isbn_id USING (isbn)
      WHERE isbn IS NOT NULl AND char_length(isbn) IN (10,13);
 CREATE INDEX IF NOT EXISTS book_rec_isbn_rec_idx ON locmds.book_rec_isbn (rec_id);

+ 1
- 3
integrate/author-info.dvc

@@ -1,10 +1,8 @@
-md5: 84aff4ccd0c4b8ca29f83e54c28ebc52
+md5: 1480e59d2b372599bc362d94e7a66a2f
 cmd: python ../run.py sql-script author-info.sql
 deps:
 - path: pgstat://loc-mds-index-books
   md5: d638b9a4a77a99749f2214e6400d85a1
-- path: pgstat://loc-mds-book-info
-  md5: be165ab9a7023147ad3f91248839562f
 - path: pgstat://viaf-index
   md5: 93814ea8630c4e4e9a3ce388a990c2c8
 - path: pgstat://cluster

+ 5
- 5
schemas/loc-mds-schema.dvc

@@ -1,17 +1,17 @@
-md5: 9ad1cf2a216b6cda63b314e177991d68
+md5: 55cde8d364fbc8502fd2b3b480187605
 cmd: python ../run.py sql-script loc-mds-schema.sql
 deps:
-- md5: 790f6cdaaf763aeddfac6b0f6df269c4
+- md5: 89549eac0d9232917eb62681b8aeaf1e
   path: loc-mds-schema.sql
-- md5: 26ef05cd212ff34511efa3cd5114d82c
+- md5: f12f086695a243c125731397d4a31bf7
   path: pgstat://common-schema
 outs:
 - path: pgstat://loc-mds-schema
   cache: false
-  md5: aec051062654d5f98dc0ccb72a0ababa
+  md5: e63399b7692987ecd6b579066e5bd35e
   metric: false
   persist: false
-- md5: 9198114bc7699b93b9ed2db7f258dac7
+- md5: d06218293061d6ef25f71c581c11bbcd
   path: loc-mds-schema.transcript
   cache: true
   metric: false

+ 13
- 0
schemas/loc-mds-schema.sql

@@ -15,6 +15,19 @@ CREATE TABLE locmds.book_marc_field (
   contents VARCHAR
 );
 
+DROP VIEW IF EXISTS locmds.book_raw_isbn CASCADE;
+CREATE VIEW locmds.book_raw_isbn
+AS SELECT rec_id, trim(contents) AS isbn_text
+   FROM locmds.book_marc_field
+   WHERE tag = '020' AND sf_code = 'a';
+
+DROP TABLE IF EXISTS locmds.book_extracted_isbn CASCADE;
+CREATE TABLE locmds.book_extracted_isbn (
+  rec_id INTEGER NOT NULL,
+  isbn VARCHAR NOT NULL,
+  isbn_tag VARCHAR
+);
+
 DROP TABLE IF EXISTS locmds.name_marc_field CASCADE;
 CREATE TABLE locmds.name_marc_field (
   rec_id INTEGER NOT NULL,