1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
- import logging
- from invoke import task
- import support as s
- _log = logging.getLogger(__name__)
@task(s.init, s.build, name='import')
def import_data(c, force=False):
    """Import GoodReads rating and book data"""
    stage = 'gr-data'

    # One entry per import step, in execution order:
    # (log message, table argument, file name under s.data_dir).
    steps = (
        ('Importing GoodReads books', 'book', 'goodreads_books.json.gz'),
        ('Importing GoodReads works', 'work', 'goodreads_book_works.json.gz'),
        ('Importing GoodReads authors', 'author', 'goodreads_book_authors.json.gz'),
        ('Importing GoodReads interactions', 'interaction', 'goodreads_interactions.json.gz'),
    )

    s.start(stage, force=force)

    # The schema script runs before any of the import commands.
    _log.info('Resetting GoodReads schema')
    s.psql(c, 'gr-schema.sql')

    for message, table, filename in steps:
        _log.info(message)
        # Each step is a single-command pipeline invoking the import-json binary.
        command = [
            s.bin_dir / 'import-json',
            '--truncate',
            'goodreads',
            table,
            s.data_dir / filename,
        ]
        s.pipeline([command])

    s.finish(stage)
@task(s.init)
def index_books(c, force=False):
    """Index GoodReads book data"""
    stage = 'gr-index-books'

    # The prerequisite is checked before this stage is started.
    s.check_prereq('gr-data')
    s.start(stage, force=force)

    _log.info('building GoodReads indexes')
    # NOTE(review): the third positional argument is forwarded as-is; its
    # meaning is defined by s.psql and is not visible from this file.
    s.psql(c, 'gr-index-books.sql', True)

    s.finish(stage)
@task(s.init)
def index_ratings(c, force=False):
    """Index GoodReads rating/interaction data"""
    stage = 'gr-index-ratings'

    # Both prerequisites are checked, in this order, before the stage starts.
    for prereq in ('gr-data', 'cluster'):
        s.check_prereq(prereq)
    s.start(stage, force=force)

    _log.info('building GoodReads rating data indexes')
    # NOTE(review): the third positional argument is forwarded as-is; its
    # meaning is defined by s.psql and is not visible from this file.
    s.psql(c, 'gr-index-ratings.sql', True)

    s.finish(stage)
@task(s.init, s.build)
def record_files(c):
    """Run the booktool hash command over the GoodReads source data files"""
    # Base names of the GoodReads dumps; each maps to
    # s.data_dir / 'goodreads_<name>.json.gz'.
    names = ('books', 'book_works', 'book_authors', 'interactions')
    files = [s.data_dir / f'goodreads_{name}.json.gz' for name in names]
    # All files are passed to a single booktool invocation.
    s.booktool(c, 'hash', *files)
|