Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

export-goodreads.py 2.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
  1. """
  2. Export GoodReads-specific data from the book data tools.
  3. Usage:
  4. export.py --book-ids
  5. export.py --work-ratings [--implicit]
  6. """
  7. from pathlib import Path
  8. from docopt import docopt
  9. import pandas as pd
  10. from bookdata import script_log
  11. from bookdata import db
  12. _log = script_log(__file__)
  13. def export_book_ids():
  14. query = '''
  15. SELECT gr_book_rid, gr_book_id, gr_work_id, cluster AS book_id
  16. FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
  17. ORDER BY gr_book_rid
  18. '''
  19. _log.info('reading book IDs')
  20. with db.connect() as dbc:
  21. books = db.load_table(dbc, query)
  22. csv_fn = 'gr-book-ids.csv.gz'
  23. pq_fn = 'gr-book-ids.parquet'
  24. _log.info('writing CSV to %s', csv_fn)
  25. books.to_csv(csv_fn, index=False)
  26. _log.info('writing parquet to %s', pq_fn)
  27. books.to_parquet(pq_fn, index=False, compression='gzip')
  28. def export_work_actions():
  29. path = data_dir / 'GR-I' / 'work-ratings.parquet'
  30. query = f'''
  31. SELECT gr_user_rid AS user_id,
  32. COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS book_id,
  33. COUNT(rating) AS nactions,
  34. MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
  35. MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
  36. FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
  37. GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
  38. ORDER BY MIN(date_updated)
  39. '''
  40. _log.info('reading book shelf actions')
  41. actions = dt.load_table(query, dtype={
  42. 'user': 'i4',
  43. 'item': 'i4',
  44. 'nactions': 'i4'
  45. })
  46. path.parent.mkdir(parents=True, exist_ok=True)
  47. _log.info('writing ratings to %s', path)
  48. actions.to_parquet(path, index=False)
  49. def export_work_ratings():
  50. path = data_dir / 'GR-E' / 'work-ratings.parquet'
  51. query = f'''
  52. SELECT gr_user_rid AS user_id,
  53. COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id)) AS book_id,
  54. MEDIAN(rating) AS rating,
  55. (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
  56. MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
  57. COUNT(rating) AS nratings
  58. FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
  59. WHERE rating > 0
  60. GROUP BY gr_user_rid, COALESCE(bc_of_gr_work(gr_work_id), bc_of_gr_book(gr_book_id))
  61. ORDER BY MIN(date_updated)
  62. '''
  63. _log.info('reading book ratings')
  64. ratings = dt.load_table(query, dtype={
  65. 'user': 'i4',
  66. 'item': 'i4',
  67. 'rating': 'f4',
  68. 'nactions': 'i4'
  69. })
  70. path.parent.mkdir(parents=True, exist_ok=True)
  71. _log.info('writing ratings to %s', path)
  72. ratings.to_parquet(path, index=False)
  73. args = docopt(__doc__)
  74. if args['--book-ids']:
  75. export_book_ids()
  76. if args['--work-ratings']:
  77. if args['--implicit']:
  78. export_work_actions()
  79. else:
  80. export_work_ratings()
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...