Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

author-info.sql 4.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  1. --- #dep bx-index
  2. --- #dep az-index
  3. --- #dep gr-index-ratings
  4. --- #dep cluster
  5. --- #dep loc-mds-cluster
  6. --- #dep viaf-index
  7. --- Schema for consolidating and calibrating author gender info
  8. --- #step Create functions
  9. CREATE OR REPLACE FUNCTION merge_gender(cgender VARCHAR, ngender VARCHAR) RETURNS VARCHAR
  10. IMMUTABLE STRICT PARALLEL SAFE
  11. AS $$ SELECT CASE
  12. WHEN ngender = 'unknown' OR ngender IS NULL THEN cgender
  13. WHEN cgender = 'unknown' THEN ngender
  14. WHEN cgender = ngender THEN ngender
  15. ELSE 'ambiguous'
  16. END
  17. $$ LANGUAGE SQL;
  18. DROP AGGREGATE IF EXISTS resolve_gender(VARCHAR);
  19. CREATE AGGREGATE resolve_gender(gender VARCHAR) (
  20. SFUNC = merge_gender,
  21. STYPE = VARCHAR,
  22. INITCOND = 'unknown'
  23. );
  24. --- #step Compute author genders for LOC clusters
  25. --- #allow duplicate_table
  26. CREATE TABLE locmds.cluster_author_gender
  27. AS SELECT cluster,
  28. case when count(an.name) = 0 then 'no-loc-author'
  29. when count(vn.rec_id) = 0 then 'no-viaf-author'
  30. when count(vg.gender) = 0 then 'no-gender'
  31. else resolve_gender(vg.gender)
  32. end AS gender
  33. FROM locmds.isbn_cluster
  34. JOIN locmds.book_rec_isbn USING (isbn_id)
  35. LEFT JOIN locmds.book_author_name an USING (rec_id)
  36. LEFT JOIN viaf.author_name vn USING (name)
  37. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  38. GROUP BY cluster;
  39. CREATE UNIQUE INDEX loc_cluster_author_gender_book_idx ON locmds.cluster_author_gender (cluster);
  40. --- #step Create MV of rated books
  41. --- #allow duplicate_table
  42. CREATE MATERIALIZED VIEW rated_book AS
  43. SELECT DISTINCT cluster, isbn_id
  44. FROM (SELECT book_id AS cluster FROM bx.add_action
  45. UNION DISTINCT
  46. SELECT book_id AS cluster FROM az.rating
  47. UNION DISTINCT
  48. SELECT book_id AS cluster FROM gr.add_action) rated
  49. LEFT JOIN isbn_cluster USING (cluster);
  50. CREATE INDEX rated_book_cluster_idx ON rated_book (cluster);
  51. CREATE INDEX rated_book_isbn_idx ON rated_book (isbn_id);
  52. ANALYZE rated_book;
  53. --- #step Extract book first-author names from OL
  54. DROP MATERIALIZED VIEW IF EXISTS cluster_ol_first_author_name;
  55. CREATE MATERIALIZED VIEW cluster_ol_first_author_name AS
  56. SELECT DISTINCT cluster, author_name
  57. FROM isbn_cluster
  58. JOIN ol.isbn_link USING (isbn_id)
  59. JOIN ol.edition USING (edition_id)
  60. JOIN ol.edition_first_author USING (edition_id)
  61. JOIN ol.author_name USING (author_id);
  62. --- #step Extract book first-author names from LOC MDS
  63. DROP MATERIALIZED VIEW IF EXISTS cluster_loc_first_author_name;
  64. CREATE MATERIALIZED VIEW cluster_loc_first_author_name AS
  65. SELECT DISTINCT cluster, name AS author_name
  66. FROM isbn_cluster
  67. JOIN locmds.book_rec_isbn USING (isbn_id)
  68. JOIN locmds.book_author_name USING (rec_id);
  69. --- #step Extract book first-author names from all available book-author data
  70. DROP MATERIALIZED VIEW IF EXISTS cluster_first_author_name;
  71. CREATE MATERIALIZED VIEW cluster_first_author_name AS
  72. SELECT cluster, author_name FROM cluster_loc_first_author_name
  73. UNION DISTINCT
  74. SELECT cluster, author_name FROM cluster_ol_first_author_name;
  75. CREATE INDEX cluster_first_author_name_cluster_idx ON cluster_first_author_name (cluster);
  76. CREATE INDEX cluster_first_author_name_idx ON cluster_first_author_name (author_name);
  77. ANALYZE cluster_first_author_name;
  78. --- #step Compute genders of first authors form all available data
  79. CREATE TABLE IF NOT EXISTS cluster_first_author_gender
  80. AS SELECT cluster,
  81. case
  82. when count(an.author_name) = 0 then 'no-loc-author'
  83. when count(vn.rec_id) = 0 then 'no-viaf-author'
  84. when count(vg.gender) = 0 then 'no-gender'
  85. else resolve_gender(vg.gender)
  86. end AS gender
  87. FROM (SELECT DISTINCT cluster FROM isbn_cluster WHERE cluster < bc_of_isbn(0)) cl -- ISBN-only recs aren't useful
  88. LEFT JOIN cluster_first_author_name an USING (cluster)
  89. LEFT JOIN viaf.author_name vn ON (name = author_name)
  90. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  91. GROUP BY cluster;
  92. CREATE UNIQUE INDEX IF NOT EXISTS cluster_first_author_gender_book_idx ON cluster_first_author_gender (cluster);
  93. ANALYZE cluster_first_author_gender;
  94. --- #step Save stage deps
  95. INSERT INTO stage_dep (stage_name, dep_name, dep_key)
  96. SELECT 'author-info', stage_name, stage_key
  97. FROM stage_status
  98. WHERE stage_name IN ('cluster', 'loc-mds-cluster');
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...