Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

author-info.sql 4.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
  1. --- #dep bx-index
  2. --- #dep az-index
  3. --- #dep gr-index-ratings
  4. --- #dep cluster
  5. --- #dep loc-mds-cluster
  6. --- #dep viaf-index
  7. --- Schema for consolidating and calibrating author gender info
  8. --- #step Create functions
  9. CREATE OR REPLACE FUNCTION merge_gender(cgender VARCHAR, ngender VARCHAR) RETURNS VARCHAR
  10. IMMUTABLE STRICT PARALLEL SAFE
  11. AS $$ SELECT CASE
  12. WHEN ngender = 'unknown' OR ngender IS NULL THEN cgender
  13. WHEN cgender = 'unknown' THEN ngender
  14. WHEN cgender = ngender THEN ngender
  15. ELSE 'ambiguous'
  16. END
  17. $$ LANGUAGE SQL;
  18. DROP AGGREGATE IF EXISTS resolve_gender(VARCHAR);
  19. CREATE AGGREGATE resolve_gender(gender VARCHAR) (
  20. SFUNC = merge_gender,
  21. STYPE = VARCHAR,
  22. INITCOND = 'unknown'
  23. );
  24. --- #step Compute author genders for LOC clusters
  25. CREATE TABLE IF NOT EXISTS locmds.cluster_author_gender (
  26. cluster INTEGER NOT NULL,
  27. gender VARCHAR NOT NULL
  28. );
  29. TRUNCATE locmds.cluster_author_gender;
  30. INSERT INTO locmds.cluster_author_gender (cluster, gender)
  31. SELECT cluster,
  32. case when count(an.name) = 0 then 'no-loc-author'
  33. when count(vn.rec_id) = 0 then 'no-viaf-author'
  34. when count(vg.gender) = 0 then 'no-gender'
  35. else resolve_gender(vg.gender)
  36. end AS gender
  37. FROM locmds.isbn_cluster
  38. JOIN locmds.book_rec_isbn USING (isbn_id)
  39. LEFT JOIN locmds.book_author_name an USING (rec_id)
  40. LEFT JOIN viaf.author_name vn USING (name)
  41. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  42. GROUP BY cluster;
  43. CREATE UNIQUE INDEX IF NOT EXISTS loc_cluster_author_gender_book_idx ON locmds.cluster_author_gender (cluster);
  44. --- #step Create MV of rated books
  45. CREATE MATERIALIZED VIEW IF NOT EXISTS rated_book AS
  46. SELECT DISTINCT cluster, isbn_id
  47. FROM (SELECT book_id AS cluster FROM bx.add_action
  48. UNION DISTINCT
  49. SELECT book_id AS cluster FROM az.rating) rated
  50. LEFT JOIN isbn_cluster USING (cluster) WITH NO DATA;
  51. REFRESH MATERIALIZED VIEW rated_book;
  52. CREATE INDEX IF NOT EXISTS rated_book_cluster_idx ON rated_book (cluster);
  53. CREATE INDEX IF NOT EXISTS rated_book_isbn_idx ON rated_book (isbn_id);
  54. ANALYZE rated_book;
  55. --- #step Extract book first-author names from OL
  56. DROP MATERIALIZED VIEW IF EXISTS cluster_ol_first_author_name;
  57. CREATE MATERIALIZED VIEW cluster_ol_first_author_name AS
  58. SELECT DISTINCT cluster, author_name
  59. FROM isbn_cluster
  60. JOIN ol.isbn_link USING (isbn_id)
  61. JOIN ol.edition USING (edition_id)
  62. JOIN ol.edition_first_author USING (edition_id)
  63. JOIN ol.author_name USING (author_id);
  64. --- #step Extract book first-author names from LOC MDS
  65. DROP MATERIALIZED VIEW IF EXISTS cluster_loc_first_author_name;
  66. CREATE MATERIALIZED VIEW cluster_loc_first_author_name AS
  67. SELECT DISTINCT cluster, name AS author_name
  68. FROM isbn_cluster
  69. JOIN locmds.book_rec_isbn USING (isbn_id)
  70. JOIN locmds.book_author_name USING (rec_id);
  71. --- #step Extract book first-author names from all available book-author data
  72. DROP MATERIALIZED VIEW IF EXISTS cluster_first_author_name;
  73. CREATE MATERIALIZED VIEW cluster_first_author_name AS
  74. SELECT cluster, author_name FROM cluster_loc_first_author_name
  75. UNION DISTINCT
  76. SELECT cluster, author_name FROM cluster_ol_first_author_name;
  77. CREATE INDEX IF NOT EXISTS cluster_first_author_name_cluster_idx ON cluster_first_author_name (cluster);
  78. CREATE INDEX IF NOT EXISTS cluster_first_author_name_idx ON cluster_first_author_name (author_name);
  79. ANALYZE cluster_first_author_name;
  80. --- #step Compute genders of first authors form all available data
  81. CREATE TABLE IF NOT EXISTS cluster_first_author_gender (
  82. cluster INTEGER NOT NULL,
  83. gender VARCHAR NOT NULL
  84. );
  85. TRUNCATE cluster_first_author_gender;
  86. INSERT INTO cluster_first_author_gender
  87. SELECT cluster,
  88. case
  89. when count(an.author_name) = 0 then 'no-loc-author'
  90. when count(vn.rec_id) = 0 then 'no-viaf-author'
  91. when count(vg.gender) = 0 then 'no-gender'
  92. else resolve_gender(vg.gender)
  93. end AS gender
  94. FROM (SELECT DISTINCT cluster FROM isbn_cluster WHERE cluster < bc_of_isbn(0)) cl -- ISBN-only recs aren't useful
  95. LEFT JOIN cluster_first_author_name an USING (cluster)
  96. LEFT JOIN viaf.author_name vn ON (name = author_name)
  97. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  98. GROUP BY cluster;
  99. CREATE UNIQUE INDEX IF NOT EXISTS cluster_first_author_gender_book_idx ON cluster_first_author_gender (cluster);
  100. ANALYZE cluster_first_author_gender;
  101. --- #step Save stage deps
  102. INSERT INTO stage_dep (stage_name, dep_name, dep_key)
  103. SELECT 'author-info', stage_name, stage_key
  104. FROM stage_status
  105. WHERE stage_name IN ('cluster', 'loc-mds-cluster');
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...