Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

author-info.sql 3.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  1. --- #dep bx-index
  2. --- #dep az-index
  3. --- #dep gr-index-ratings
  4. --- #dep cluster
  5. --- #dep loc-mds-cluster
  6. --- #dep viaf-index
  7. --- Schema for consolidating and calibrating author gender info
  8. --- #step Create functions
  9. CREATE OR REPLACE FUNCTION merge_gender(cgender VARCHAR, ngender VARCHAR) RETURNS VARCHAR
  10. IMMUTABLE STRICT PARALLEL SAFE
  11. AS $$ SELECT CASE
  12. WHEN ngender = 'unknown' OR ngender IS NULL THEN cgender
  13. WHEN cgender = 'unknown' THEN ngender
  14. WHEN cgender = ngender THEN ngender
  15. ELSE 'ambiguous'
  16. END
  17. $$ LANGUAGE SQL;
  18. DROP AGGREGATE IF EXISTS resolve_gender(VARCHAR);
  19. CREATE AGGREGATE resolve_gender(gender VARCHAR) (
  20. SFUNC = merge_gender,
  21. STYPE = VARCHAR,
  22. INITCOND = 'unknown'
  23. );
  24. --- #step Extract book first-author names from OL
  25. DROP MATERIALIZED VIEW IF EXISTS cluster_ol_first_author_name;
  26. CREATE MATERIALIZED VIEW cluster_ol_first_author_name AS
  27. SELECT DISTINCT cluster, author_name
  28. FROM isbn_cluster
  29. JOIN ol.isbn_link USING (isbn_id)
  30. JOIN ol.edition USING (edition_id)
  31. JOIN ol.edition_first_author USING (edition_id)
  32. JOIN ol.author_name USING (author_id);
  33. --- #step Extract book first-author names from LOC MDS
  34. DROP MATERIALIZED VIEW IF EXISTS cluster_loc_first_author_name;
  35. CREATE MATERIALIZED VIEW cluster_loc_first_author_name AS
  36. SELECT DISTINCT cluster, name AS author_name
  37. FROM isbn_cluster
  38. JOIN locmds.book_rec_isbn USING (isbn_id)
  39. JOIN locmds.book_author_name USING (rec_id);
  40. --- #step Extract book first-author names from all available book-author data
  41. DROP MATERIALIZED VIEW IF EXISTS cluster_first_author_name;
  42. CREATE MATERIALIZED VIEW cluster_first_author_name AS
  43. SELECT cluster, author_name FROM cluster_loc_first_author_name
  44. UNION DISTINCT
  45. SELECT cluster, author_name FROM cluster_ol_first_author_name;
  46. CREATE INDEX IF NOT EXISTS cluster_first_author_name_cluster_idx ON cluster_first_author_name (cluster);
  47. CREATE INDEX IF NOT EXISTS cluster_first_author_name_idx ON cluster_first_author_name (author_name);
  48. ANALYZE cluster_first_author_name;
  49. --- #step Compute genders of first authors form all available data
  50. CREATE TABLE IF NOT EXISTS cluster_first_author_gender (
  51. cluster INTEGER NOT NULL,
  52. gender VARCHAR NOT NULL
  53. );
  54. TRUNCATE cluster_first_author_gender;
  55. INSERT INTO cluster_first_author_gender
  56. SELECT cluster,
  57. case
  58. when count(an.author_name) = 0 then 'no-loc-author'
  59. when count(vn.rec_id) = 0 then 'no-viaf-author'
  60. when count(vg.gender) = 0 then 'no-gender'
  61. else resolve_gender(vg.gender)
  62. end AS gender
  63. FROM (SELECT DISTINCT cluster FROM isbn_cluster WHERE cluster < bc_of_isbn(0)) cl -- ISBN-only recs aren't useful
  64. LEFT JOIN cluster_first_author_name an USING (cluster)
  65. LEFT JOIN viaf.author_name vn ON (name = author_name)
  66. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  67. GROUP BY cluster;
  68. CREATE UNIQUE INDEX IF NOT EXISTS cluster_first_author_gender_book_idx ON cluster_first_author_gender (cluster);
  69. ANALYZE cluster_first_author_gender;
  70. --- #step Save stage deps
  71. INSERT INTO stage_dep (stage_name, dep_name, dep_key)
  72. SELECT 'author-info', stage_name, stage_key
  73. FROM stage_status
  74. WHERE stage_name IN ('cluster', 'loc-mds-cluster');
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...