Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

export-goodreads.py 3.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  1. """
  2. Export GoodReads-specific data from the book data tools.
  3. Usage:
  4. export.py --book-ids
  5. export.py --work-titles
  6. export.py --work-ratings [--implicit]
  7. """
  8. from pathlib import Path
  9. from docopt import docopt
  10. import pandas as pd
  11. from bookdata import script_log
  12. from bookdata import db
  13. _log = script_log('export-goodreads')
  14. def export_book_ids():
  15. query = '''
  16. SELECT gr_book_rid, gr_book_id, gr_work_id, cluster AS book_id
  17. FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
  18. ORDER BY gr_book_rid
  19. '''
  20. with db.connect() as dbc:
  21. _log.info('reading book IDs')
  22. books = db.load_table(dbc, query)
  23. csv_fn = 'gr-book-ids.csv.gz'
  24. pq_fn = 'gr-book-ids.parquet'
  25. _log.info('writing CSV to %s', csv_fn)
  26. books.to_csv(csv_fn, index=False)
  27. _log.info('writing parquet to %s', pq_fn)
  28. books.to_parquet(pq_fn, index=False, compression='gzip')
  29. def export_work_titles():
  30. query = f'''
  31. SELECT gr_work_rid, gr_work_id, work_title
  32. FROM gr.work_title
  33. ORDER BY gr_work_rid
  34. '''
  35. with db.connect() as dbc:
  36. _log.info('reading work titles')
  37. books = db.load_table(dbc, query)
  38. pq_fn = 'gr-work-titles.parquet'
  39. _log.info('writing parquet to %s', pq_fn)
  40. books.to_parquet(pq_fn, index=False, compression='brotli')
  41. def export_work_actions():
  42. query = '''
  43. SELECT gr_user_rid AS user,
  44. COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS item,
  45. COUNT(rating) AS nactions,
  46. MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
  47. MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
  48. FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
  49. GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
  50. ORDER BY MIN(date_updated)
  51. '''
  52. with db.connect() as dbc:
  53. _log.info('reading book shelf actions')
  54. actions = db.load_table(dbc, query, dtype={
  55. 'user': 'i4',
  56. 'item': 'i4',
  57. 'nactions': 'i4'
  58. })
  59. _log.info('writing actions')
  60. actions.to_parquet('gr-work-actions.parquet', index=False, compression='brotli')
  61. def export_work_ratings():
  62. query = '''
  63. SELECT gr_user_rid AS user,
  64. COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS item,
  65. MEDIAN(rating) AS rating,
  66. (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
  67. MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
  68. COUNT(rating) AS nratings
  69. FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
  70. WHERE rating > 0
  71. GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
  72. ORDER BY MIN(date_updated)
  73. '''
  74. with db.connect() as dbc:
  75. _log.info('reading book ratings')
  76. ratings = db.load_table(dbc, query, dtype={
  77. 'user': 'i4',
  78. 'item': 'i4',
  79. 'rating': 'f4',
  80. 'last_rating': 'f4',
  81. 'nratings': 'i4'
  82. })
  83. _log.info('writing ratings')
  84. ratings.to_parquet('gr-work-ratings.parquet', index=False, compression='brotli')
  85. args = docopt(__doc__)
  86. if args['--book-ids']:
  87. export_book_ids()
  88. if args['--work-titles']:
  89. export_work_titles()
  90. if args['--work-ratings']:
  91. if args['--implicit']:
  92. export_work_actions()
  93. else:
  94. export_work_ratings()
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...