Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

goodreads.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
  1. import logging
  2. from invoke import task
  3. import support as s
  4. _log = logging.getLogger(__name__)
  @task(s.init, s.build, name='import')
  def import_data(c, force=False):
      """Import GoodReads rating and book data

      Resets the GoodReads schema by running ``gr-schema.sql`` through
      ``s.psql``, then loads four gzipped JSON dumps (books, works, authors,
      interactions) from ``s.data_dir``.  For each dump, a two-command
      pipeline is handed to ``s.pipeline``: the ``clean-json`` program from
      ``s.bin_dir`` applied to the dump, followed by a ``psql`` copy-from-STDIN
      command targeting the matching raw table.

      The whole run is bracketed by ``s.start('gr-data', force=force)`` and
      ``s.finish('gr-data')``.

      NOTE(review): ``s.start``/``s.finish`` presumably record stage status
      and ``s.pipeline`` presumably connects each command's stdout to the
      next command's stdin -- confirm against the ``support`` module.

      Args:
          c: the invoke context, passed through to ``s.psql``.
          force: forwarded unchanged to ``s.start``.
      """
      stage = 'gr-data'

      # One row per dump, in load order:
      #   (log message, input file under s.data_dir, psql \copy command)
      loads = (
          (
              'Importing GoodReads books',
              'goodreads_books.json.gz',
              '\\copy gr_raw_book (gr_book_data) FROM STDIN',
          ),
          (
              'Importing GoodReads works',
              'goodreads_book_works.json.gz',
              '\\copy gr_raw_work (gr_work_data) FROM STDIN',
          ),
          (
              'Importing GoodReads authors',
              'goodreads_book_authors.json.gz',
              '\\copy gr_raw_author (gr_author_data) FROM STDIN',
          ),
          (
              'Importing GoodReads interactions',
              'goodreads_interactions.json.gz',
              '\\copy gr_raw_interaction (gr_int_data) FROM STDIN',
          ),
      )

      s.start(stage, force=force)

      _log.info('Resetting GoodReads schema')
      s.psql(c, 'gr-schema.sql')

      for message, filename, copy_command in loads:
          _log.info(message)
          clean_step = [s.bin_dir / 'clean-json', s.data_dir / filename]
          load_step = ['psql', '-c', copy_command]
          s.pipeline([clean_step, load_step])

      s.finish(stage)
  @task(s.init)
  def index_books(c, force=False):
      """Index GoodReads book data

      Checks the ``gr-data`` prerequisite via ``s.check_prereq``, then runs
      ``gr-index-books.sql`` through ``s.psql`` between
      ``s.start('gr-index-books', force=force)`` and
      ``s.finish('gr-index-books')``.

      NOTE(review): ``s.check_prereq`` presumably aborts when the named stage
      has not completed -- confirm against the ``support`` module.

      Args:
          c: the invoke context, passed through to ``s.psql``.
          force: forwarded unchanged to ``s.start``.
      """
      stage = 'gr-index-books'
      script = 'gr-index-books.sql'

      # The prerequisite check comes before the stage is started.
      s.check_prereq('gr-data')

      s.start(stage, force=force)
      _log.info('building GoodReads indexes')
      s.psql(c, script)
      s.finish(stage)
  @task(s.init)
  def index_ratings(c, force=False):
      """Index GoodReads rating/interaction data

      Checks the ``gr-data`` and ``cluster`` prerequisites (in that order)
      via ``s.check_prereq``, then runs ``gr-index-ratings.sql`` through
      ``s.psql`` between ``s.start('gr-index-ratings', force=force)`` and
      ``s.finish('gr-index-ratings')``.

      NOTE(review): ``s.check_prereq`` presumably aborts when the named stage
      has not completed -- confirm against the ``support`` module.

      Args:
          c: the invoke context, passed through to ``s.psql``.
          force: forwarded unchanged to ``s.start``.
      """
      stage = 'gr-index-ratings'
      script = 'gr-index-ratings.sql'

      # Both prerequisites are checked, in order, before the stage is started.
      for required in ('gr-data', 'cluster'):
          s.check_prereq(required)

      s.start(stage, force=force)
      _log.info('building GoodReads indexes')
      s.psql(c, script)
      s.finish(stage)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...