Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

gr-book-info.sql 4.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  1. --- #dep gr-books
  2. --- #dep gr-works
  3. --- #dep gr-book-authors
  4. --- #dep gr-index-books
  5. --- #dep gr-index-ratings
  6. --- #dep cluster
  7. --- #table gr.work_title
  8. --- #table gr.book_pub_date
  9. --- #table gr.book_interstats
  10. --- #step Create useful GR functions
  11. CREATE OR REPLACE FUNCTION try_date(year VARCHAR, month VARCHAR, day VARCHAR) RETURNS DATE
  12. IMMUTABLE RETURNS NULL ON NULL INPUT PARALLEL UNSAFE
  13. LANGUAGE plpgsql AS $$
  14. BEGIN
  15. RETURN MAKE_DATE(NULLIF(year, '')::INTEGER,
  16. NULLIF(month, '')::INTEGER,
  17. NULLIF(day, '')::INTEGER);
  18. EXCEPTION WHEN SQLSTATE '22008' THEN
  19. RETURN NULL;
  20. END;
  21. $$;
  22. --- #step Index book clusters
  23. CREATE MATERIALIZED VIEW IF NOT EXISTS gr.book_cluster
  24. AS SELECT DISTINCT gr_book_id, cluster
  25. FROM gr.book_isbn JOIN isbn_cluster USING (isbn_id);
  26. CREATE UNIQUE INDEX IF NOT EXISTS book_cluster_book_idx ON gr.book_cluster (gr_book_id);
  27. CREATE INDEX IF NOT EXISTS book_cluster_cluster_idx ON gr.book_cluster (cluster);
  28. ANALYZE gr.book_cluster;
  29. --- #step Extract GoodReads work titles
  30. DROP MATERIALIZED VIEW IF EXISTS gr.work_title;
  31. CREATE MATERIALIZED VIEW gr.work_title
  32. AS SELECT gr_work_rid, (gr_work_data->>'work_id')::int AS gr_work_id,
  33. NULLIF(gr_work_data->>'original_title', '') AS work_title
  34. FROM gr.raw_work;
  35. CREATE INDEX gr_work_title_work_idx ON gr.work_title (gr_work_id);
  36. ANALYZE gr.work_title;
  37. --- #step Extract GoodReads book titles
  38. DROP MATERIALIZED VIEW IF EXISTS gr.book_title;
  39. CREATE MATERIALIZED VIEW gr.book_title
  40. AS SELECT gr_book_rid, (gr_book_data->>'book_id')::int AS gr_book_id,
  41. NULLIF(gr_book_data->>'title', '') AS book_title
  42. FROM gr.raw_book;
  43. CREATE INDEX gr_book_title_book_idx ON gr.book_title (gr_book_id);
  44. ANALYZE gr.book_title;
  45. --- #step Extract GoodReads book publication dates
  46. DROP MATERIALIZED VIEW IF EXISTS gr.book_pub_date;
  47. CREATE MATERIALIZED VIEW gr.book_pub_date
  48. AS SELECT gr_book_rid, book_id AS gr_book_id,
  49. NULLIF(publication_year, '')::INTEGER AS pub_year,
  50. NULLIF(publication_month, '')::INTEGER AS pub_month,
  51. NULLIF(publication_day, '')::INTEGER AS pub_day,
  52. try_date(publication_year, publication_month, publication_day) AS pub_date
  53. FROM gr.raw_book,
  54. jsonb_to_record(gr_book_data) AS
  55. x(book_id INTEGER, publication_year VARCHAR,
  56. publication_month VARCHAR, publication_day VARCHAR)
  57. WHERE NULLIF(publication_year, '') IS NOT NULL;
  58. CREATE UNIQUE INDEX gr_bpd_rec_idx ON gr.book_pub_date (gr_book_rid);
  59. CREATE UNIQUE INDEX gr_bpd_book_idx ON gr.book_pub_date (gr_book_id);
  60. ANALYZE gr.book_pub_date;
  61. --- #step Extract GoodReads work original publication dates
  62. DROP MATERIALIZED VIEW IF EXISTS gr.work_pub_date;
  63. CREATE MATERIALIZED VIEW gr.work_pub_date
  64. AS SELECT gr_work_rid, work_id AS gr_work_id,
  65. NULLIF(original_publication_year, '')::INTEGER AS pub_year,
  66. NULLIF(original_publication_month, '')::INTEGER AS pub_month,
  67. NULLIF(original_publication_day, '')::INTEGER AS pub_day,
  68. try_date(original_publication_year, original_publication_month, original_publication_day) AS pub_date
  69. FROM gr.raw_work,
  70. jsonb_to_record(gr_work_data) AS
  71. x(work_id INTEGER, original_publication_year VARCHAR,
  72. original_publication_month VARCHAR, original_publication_day VARCHAR)
  73. WHERE NULLIF(original_publication_year, '') IS NOT NULL;
  74. CREATE UNIQUE INDEX gr_wpd_rec_idx ON gr.work_pub_date (gr_work_rid);
  75. CREATE UNIQUE INDEX gr_wpd_work_idx ON gr.work_pub_date (gr_work_id);
  76. ANALYZE gr.work_pub_date;
  77. --- #step Create book statistics table
  78. CREATE MATERIALIZED VIEW IF NOT EXISTS gr.book_interstats AS
  79. SELECT gr_book_id,
  80. COUNT(gr_interaction_rid) as n_shelves,
  81. COUNT(DISTINCT gr_user_rid) as n_users,
  82. COUNT(NULLIF(rating, 0)) as n_rates,
  83. AVG(NULLIF(rating, 0)) as mean_rate,
  84. SUM(CASE WHEN rating > 2 THEN 1 ELSE 0 END) as n_pos_rates
  85. FROM gr.interaction
  86. GROUP BY gr_book_id;
  87. CREATE INDEX IF NOT EXISTS gr_bis_book_idx ON gr.book_interstats (gr_book_id);
  88. ANALYZE gr.book_interstats;
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...