Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

author-info.sql 5.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  1. --- Schema for consolidating and calibrating author gender info
  2. --- #step Create functions
  3. CREATE OR REPLACE FUNCTION merge_gender(cgender VARCHAR, ngender VARCHAR) RETURNS VARCHAR
  4. IMMUTABLE STRICT PARALLEL SAFE
  5. AS $$ SELECT CASE
  6. WHEN ngender = 'unknown' OR ngender IS NULL THEN cgender
  7. WHEN cgender = 'unknown' THEN ngender
  8. WHEN cgender = ngender THEN ngender
  9. ELSE 'ambiguous'
  10. END
  11. $$ LANGUAGE SQL;
  12. DROP AGGREGATE IF EXISTS resolve_gender(VARCHAR);
  13. CREATE AGGREGATE resolve_gender(gender VARCHAR) (
  14. SFUNC = merge_gender,
  15. STYPE = VARCHAR,
  16. INITCOND = 'unknown'
  17. );
  18. --- #step Compute author genders for LOC clusters
  19. DROP TABLE IF EXISTS locmds.cluster_author_gender;
  20. CREATE TABLE locmds.cluster_author_gender
  21. AS SELECT cluster,
  22. case when count(an.name) = 0 then 'no-loc-author'
  23. when count(vn.rec_id) = 0 then 'no-viaf-author'
  24. when count(vg.gender) = 0 then 'no-gender'
  25. else resolve_gender(vg.gender)
  26. end AS gender
  27. FROM locmds.isbn_cluster
  28. JOIN locmds.book_rec_isbn USING (isbn_id)
  29. LEFT JOIN locmds.book_author_name an USING (rec_id)
  30. LEFT JOIN viaf.author_name vn USING (name)
  31. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  32. GROUP BY cluster;
  33. CREATE UNIQUE INDEX loc_cluster_author_gender_book_idx ON locmds.cluster_author_gender (cluster);
  34. --- #step Compute author genders for global clusters, using LOC author records
  35. DROP TABLE IF EXISTS cluster_loc_author_gender;
  36. CREATE TABLE cluster_loc_author_gender
  37. AS SELECT cluster,
  38. case
  39. when count(an.name) = 0 then 'no-loc-author'
  40. when count(vn.rec_id) = 0 then 'no-viaf-author'
  41. when count(vg.gender) = 0 then 'no-gender'
  42. else resolve_gender(vg.gender)
  43. end AS gender
  44. FROM isbn_cluster
  45. JOIN locmds.book_rec_isbn ri USING (isbn_id)
  46. LEFT JOIN locmds.book_author_name an USING (rec_id)
  47. LEFT JOIN viaf.author_name vn USING (name)
  48. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  49. GROUP BY cluster;
  50. CREATE UNIQUE INDEX cluster_loc_author_gender_book_idx ON cluster_loc_author_gender (cluster);
  51. --- #step Create MV of rated books
  52. DROP MATERIALIZED VIEW IF EXISTS rated_book CASCADE;
  53. CREATE MATERIALIZED VIEW rated_book AS
  54. SELECT DISTINCT cluster, isbn_id
  55. FROM (SELECT book_id AS cluster FROM bx.add_action
  56. UNION DISTINCT
  57. SELECT book_id AS cluster FROM az.rating
  58. UNION DISTINCT
  59. SELECT book_id AS cluster FROM gr.add_action) rated
  60. LEFT JOIN isbn_cluster USING (cluster);
  61. CREATE INDEX rated_book_cluster_idx ON rated_book (cluster);
  62. CREATE INDEX rated_book_isbn_idx ON rated_book (isbn_id);
  63. ANALYZE rated_book;
  64. --- #step Extract book author names from OL
  65. DROP MATERIALIZED VIEW IF EXISTS cluster_ol_author_name;
  66. CREATE MATERIALIZED VIEW cluster_ol_author_name AS
  67. SELECT DISTINCT cluster, author_name
  68. FROM isbn_cluster
  69. JOIN ol.isbn_link USING (isbn_id)
  70. JOIN ol.edition USING (edition_id)
  71. JOIN ol.edition_author USING (edition_id)
  72. JOIN ol.author_name USING (author_id);
  73. --- #step Extract book first-author names from OL
  74. DROP MATERIALIZED VIEW IF EXISTS cluster_ol_first_author_name;
  75. CREATE MATERIALIZED VIEW cluster_ol_first_author_name AS
  76. SELECT DISTINCT cluster, author_name
  77. FROM isbn_cluster
  78. JOIN ol.isbn_link USING (isbn_id)
  79. JOIN ol.edition USING (edition_id)
  80. JOIN ol.edition_first_author USING (edition_id)
  81. JOIN ol.author_name USING (author_id);
  82. --- #step Extract book first-author names from LOC MDS
  83. DROP MATERIALIZED VIEW IF EXISTS cluster_loc_first_author_name;
  84. CREATE MATERIALIZED VIEW cluster_loc_author_name AS
  85. SELECT DISTINCT cluster, name AS author_name
  86. FROM isbn_cluster
  87. JOIN locmds.book_rec_isbn USING (isbn_id)
  88. JOIN locmds.book_author_name USING (rec_id);
  89. --- #step Extract book first-author names from all available book-author data
  90. DROP MATERIALIZED VIEW IF EXISTS cluster_first_author_name;
  91. CREATE MATERIALIZED VIEW cluster_first_author_name AS
  92. SELECT cluster, author_name FROM cluster_loc_first_author_name
  93. UNION DISTINCT
  94. SELECT cluster, author_name FROM cluster_ol_first_author_name;
  95. CREATE INDEX cluster_first_author_name_cluster_idx ON cluster_first_author_name (cluster);
  96. CREATE INDEX cluster_first_author_name_idx ON cluster_first_author_name (author_name);
  97. ANALYZE cluster_first_author_name;
  98. --- #step Compute genders of first authors form all available data
  99. DROP TABLE IF EXISTS cluster_first_author_gender;
  100. CREATE TABLE cluster_first_author_gender
  101. AS SELECT cluster,
  102. case
  103. when count(an.author_name) = 0 then 'no-loc-author'
  104. when count(vn.rec_id) = 0 then 'no-viaf-author'
  105. when count(vg.gender) = 0 then 'no-gender'
  106. else resolve_gender(vg.gender)
  107. end AS gender
  108. FROM (SELECT DISTINCT cluster FROM isbn_cluster WHERE cluster < bc_of_isbn(0)) cl -- ISBN-only recs aren't useful
  109. LEFT JOIN cluster_first_author_name an USING (cluster)
  110. LEFT JOIN viaf.author_name vn ON (name = author_name)
  111. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  112. GROUP BY cluster;
  113. CREATE UNIQUE INDEX cluster_first_author_gender_book_idx ON cluster_first_author_gender (cluster);
  114. ANALYZE cluster_first_author_gender;
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...