Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

author-info.sql 3.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  1. --- #dep bx-index
  2. --- #dep az-index
  3. --- #dep gr-index-ratings
  4. --- #dep cluster
  5. --- #dep loc-mds-cluster
  6. --- #dep viaf-index
  7. --- Schema for consolidating and calibrating author gender info
  8. --- #step Create functions
  9. CREATE OR REPLACE FUNCTION merge_gender(cgender VARCHAR, ngender VARCHAR) RETURNS VARCHAR
  10. IMMUTABLE STRICT PARALLEL SAFE
  11. AS $$ SELECT CASE
  12. WHEN ngender = 'unknown' OR ngender IS NULL THEN cgender
  13. WHEN cgender = 'unknown' THEN ngender
  14. WHEN cgender = ngender THEN ngender
  15. ELSE 'ambiguous'
  16. END
  17. $$ LANGUAGE SQL;
  18. DROP AGGREGATE IF EXISTS resolve_gender(VARCHAR);
  19. CREATE AGGREGATE resolve_gender(gender VARCHAR) (
  20. SFUNC = merge_gender,
  21. STYPE = VARCHAR,
  22. INITCOND = 'unknown'
  23. );
  24. --- #step Create MV of rated books
  25. CREATE MATERIALIZED VIEW IF NOT EXISTS rated_book AS
  26. SELECT DISTINCT cluster, isbn_id
  27. FROM (SELECT book_id AS cluster FROM bx.add_action
  28. UNION DISTINCT
  29. SELECT book_id AS cluster FROM az.rating) rated
  30. LEFT JOIN isbn_cluster USING (cluster) WITH NO DATA;
  31. REFRESH MATERIALIZED VIEW rated_book;
  32. CREATE INDEX IF NOT EXISTS rated_book_cluster_idx ON rated_book (cluster);
  33. CREATE INDEX IF NOT EXISTS rated_book_isbn_idx ON rated_book (isbn_id);
  34. ANALYZE rated_book;
  35. --- #step Extract book first-author names from OL
  36. DROP MATERIALIZED VIEW IF EXISTS cluster_ol_first_author_name;
  37. CREATE MATERIALIZED VIEW cluster_ol_first_author_name AS
  38. SELECT DISTINCT cluster, author_name
  39. FROM isbn_cluster
  40. JOIN ol.isbn_link USING (isbn_id)
  41. JOIN ol.edition USING (edition_id)
  42. JOIN ol.edition_first_author USING (edition_id)
  43. JOIN ol.author_name USING (author_id);
  44. --- #step Extract book first-author names from LOC MDS
  45. DROP MATERIALIZED VIEW IF EXISTS cluster_loc_first_author_name;
  46. CREATE MATERIALIZED VIEW cluster_loc_first_author_name AS
  47. SELECT DISTINCT cluster, name AS author_name
  48. FROM isbn_cluster
  49. JOIN locmds.book_rec_isbn USING (isbn_id)
  50. JOIN locmds.book_author_name USING (rec_id);
  51. --- #step Extract book first-author names from all available book-author data
  52. DROP MATERIALIZED VIEW IF EXISTS cluster_first_author_name;
  53. CREATE MATERIALIZED VIEW cluster_first_author_name AS
  54. SELECT cluster, author_name FROM cluster_loc_first_author_name
  55. UNION DISTINCT
  56. SELECT cluster, author_name FROM cluster_ol_first_author_name;
  57. CREATE INDEX IF NOT EXISTS cluster_first_author_name_cluster_idx ON cluster_first_author_name (cluster);
  58. CREATE INDEX IF NOT EXISTS cluster_first_author_name_idx ON cluster_first_author_name (author_name);
  59. ANALYZE cluster_first_author_name;
  60. --- #step Compute genders of first authors form all available data
  61. CREATE TABLE IF NOT EXISTS cluster_first_author_gender (
  62. cluster INTEGER NOT NULL,
  63. gender VARCHAR NOT NULL
  64. );
  65. TRUNCATE cluster_first_author_gender;
  66. INSERT INTO cluster_first_author_gender
  67. SELECT cluster,
  68. case
  69. when count(an.author_name) = 0 then 'no-loc-author'
  70. when count(vn.rec_id) = 0 then 'no-viaf-author'
  71. when count(vg.gender) = 0 then 'no-gender'
  72. else resolve_gender(vg.gender)
  73. end AS gender
  74. FROM (SELECT DISTINCT cluster FROM isbn_cluster WHERE cluster < bc_of_isbn(0)) cl -- ISBN-only recs aren't useful
  75. LEFT JOIN cluster_first_author_name an USING (cluster)
  76. LEFT JOIN viaf.author_name vn ON (name = author_name)
  77. LEFT JOIN viaf.author_gender vg ON (vn.rec_id = vg.rec_id)
  78. GROUP BY cluster;
  79. CREATE UNIQUE INDEX IF NOT EXISTS cluster_first_author_gender_book_idx ON cluster_first_author_gender (cluster);
  80. ANALYZE cluster_first_author_gender;
  81. --- #step Save stage deps
  82. INSERT INTO stage_dep (stage_name, dep_name, dep_key)
  83. SELECT 'author-info', stage_name, stage_key
  84. FROM stage_status
  85. WHERE stage_name IN ('cluster', 'loc-mds-cluster');
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...