Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

goodreads.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  1. import logging
  2. from invoke import task
  3. import support as s
  4. _log = logging.getLogger(__name__)
  5. @task(s.init, s.build, name='import')
  6. def import_data(c, force=False):
  7. "Import GoodReads rating and book data"
  8. s.start('gr-data', force=force)
  9. _log.info('Resetting GoodReads schema')
  10. s.psql(c, 'gr-schema.sql')
  11. _log.info('Importing GoodReads books')
  12. s.pipeline([
  13. [s.bin_dir / 'import-json', '--truncate', 'goodreads', 'book', s.data_dir / 'goodreads_books.json.gz']
  14. ])
  15. _log.info('Importing GoodReads works')
  16. s.pipeline([
  17. [s.bin_dir / 'import-json', '--truncate', 'goodreads', 'work', s.data_dir / 'goodreads_book_works.json.gz']
  18. ])
  19. _log.info('Importing GoodReads authors')
  20. s.pipeline([
  21. [s.bin_dir / 'import-json', '--truncate', 'goodreads', 'author', s.data_dir / 'goodreads_book_authors.json.gz']
  22. ])
  23. _log.info('Importing GoodReads interactions')
  24. s.pipeline([
  25. [s.bin_dir / 'import-json', '--truncate', 'goodreads', 'interaction', s.data_dir / 'goodreads_interactions.json.gz']
  26. ])
  27. s.finish('gr-data')
  28. @task(s.init)
  29. def index_books(c, force=False):
  30. "Index GoodReads book data"
  31. s.check_prereq('gr-data')
  32. s.start('gr-index-books', force=force)
  33. _log.info('building GoodReads indexes')
  34. s.psql(c, 'gr-index-books.sql', True)
  35. s.finish('gr-index-books')
  36. @task(s.init)
  37. def index_ratings(c, force=False):
  38. "Index GoodReads rating/interaction data"
  39. s.check_prereq('gr-data')
  40. s.check_prereq('cluster')
  41. s.start('gr-index-ratings', force=force)
  42. _log.info('building GoodReads rating data indexes')
  43. s.psql(c, 'gr-index-ratings.sql', True)
  44. s.finish('gr-index-ratings')
  45. @task(s.init, s.build)
  46. def record_files(c):
  47. files = [s.data_dir / f'goodreads_{f}.json.gz' for f in ['books', 'book_works', 'book_authors', 'interactions']]
  48. s.booktool(c, 'hash', *files)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...