Browse Source

ISBN normalization

Michael Ekstrand 10 months ago
parent
commit
9ef8ccbffe
2 changed files with 26 additions and 0 deletions
  1. index/isbn-norm.dvc (10 additions, 0 deletions)
  2. index/isbn-norm.sql (16 additions, 0 deletions)

+ 10
- 0
index/isbn-norm.dvc

@@ -0,0 +1,10 @@
+cmd: python ../run.py sql-script isbn-norm.sql  # stage command: feeds isbn-norm.sql to ../run.py in sql-script mode
+deps:  # inputs of this stage
+  - path: "isbn-norm.sql"
+  - path: "pgstat://loc-mds-index-books"  # NOTE(review): pgstat:// looks like a project-specific scheme; confirm what it tracks
+  - path: "pgstat://gr-index-books"
+  - path: "pgstat://ol-index"
+outs:  # outputs of this stage
+  - path: "isbn-norm.transcript"
+  - path: "pgstat://isbn-norm"
+    cache: false  # caching is switched off for this output only

+ 16
- 0
index/isbn-norm.sql

@@ -0,0 +1,16 @@
+--- #dep loc-mds-index-books
+--- #dep gr-index-books
+--- #dep ol-index
+
+--- #step Extract normalized ISBNs
+DROP TABLE IF EXISTS isbn_norm CASCADE;  -- rebuilt from scratch on every run; CASCADE also removes objects that depend on it
+CREATE TABLE isbn_norm (
+    isbn_id INTEGER PRIMARY KEY,  -- copied from isbn_id.isbn_id; the key allows one normalized value per source row
+    norm_isbn EAN13 NOT NULL  -- NOTE(review): EAN13, isbn(), ean13(), make_valid() look like PostgreSQL's isn extension; confirm
+);
+INSERT INTO isbn_norm (isbn_id, norm_isbn)  -- pass 1: 10-character values (nine digits, then a digit or x/X)
+SELECT isbn_id, make_valid(isbn(isbn || '!'))  -- presumably '!' lets a bad check digit parse and make_valid() repairs it; confirm
+FROM isbn_id WHERE isbn ~ '^\d{9}[\dxX]$';
+INSERT INTO isbn_norm (isbn_id, norm_isbn)  -- pass 2: 13-character values starting with 9, so no overlap with pass 1
+SELECT isbn_id, make_valid(ean13(regexp_replace(isbn, '[xX]$', '0') || '!'))  -- a trailing x/X is swapped for 0 before parsing
+FROM isbn_id WHERE isbn ~ '^9\d{11}[\dxX]$';