Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

ratings.py 2.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  1. from io import StringIO
  2. import csv
  3. import subprocess as sp
  4. import numpy as np
  5. from tqdm import tqdm
  6. import psycopg2
  7. from invoke import task
  8. import support as s
  9. @task
  10. def import_bx(c):
  11. "Import BookCrossing ratings"
  12. print("initializing BX schema")
  13. c.run('psql -f bx-schema.sql')
  14. print("cleaning BX rating data")
  15. with open('data/BX-Book-Ratings.csv', 'rb') as bf:
  16. data = bf.read()
  17. barr = np.frombuffer(data, dtype='u1')
  18. # delete bytes that are too big
  19. barr = barr[barr < 128]
  20. # convert to LF
  21. barr = barr[barr != ord('\r')]
  22. # change delimiter to comma
  23. barr[barr == ord(';')] = ord(',')
  24. # write
  25. print('importing BX to database')
  26. data = bytes(barr)
  27. rd = StringIO(data.decode('utf8'))
  28. dbc = psycopg2.connect("")
  29. try:
  30. # with dbc encapsulates a transaction
  31. with dbc, dbc.cursor() as cur:
  32. for row in tqdm(csv.DictReader(rd)):
  33. cur.execute('INSERT INTO bx_raw_ratings (user_id, isbn, rating) VALUES (%s, %s, %s)',
  34. (row['User-ID'], row['ISBN'], row['Book-Rating']))
  35. finally:
  36. dbc.close()
  37. @task(s.build)
  38. def import_az(c):
  39. "Import Amazon ratings"
  40. print('Resetting Amazon schema')
  41. c.run('psql -f az-schema.sql')
  42. print('Importing Amazon ratings')
  43. s.pipeline([
  44. [s.bin_dir / 'pcat', s.data_dir / 'ratings_Books.csv'],
  45. ['psql', '-c', '\\copy az_raw_ratings FROM STDIN (FORMAT CSV)']
  46. ])
  47. @task(s.build)
  48. def import_gr(c):
  49. "Import GoodReads ratings"
  50. print('Resetting GoodReads schema')
  51. c.run('psql -f gr-schema.sql')
  52. print('Importing GoodReads books')
  53. s.pipeline([
  54. [s.bin_dir / 'clean-json', s.data_dir / 'goodreads_books.json.gz'],
  55. ['psql', '-c', '\\copy gr_raw_book (gr_book_data) FROM STDIN']
  56. ])
  57. print('Importing GoodReads works')
  58. s.pipeline([
  59. [s.bin_dir / 'clean-json', s.data_dir / 'goodreads_book_works.json.gz'],
  60. ['psql', '-c', '\\copy gr_raw_work (gr_work_data) FROM STDIN']
  61. ])
  62. print('Importing GoodReads authors')
  63. s.pipeline([
  64. [s.bin_dir / 'clean-json', s.data_dir / 'goodreads_book_authors.json.gz'],
  65. ['psql', '-c', '\\copy gr_raw_author (gr_author_data) FROM STDIN']
  66. ])
  67. print('Importing GoodReads interactions')
  68. s.pipeline([
  69. [s.bin_dir / 'clean-json', s.data_dir / 'goodreads_interactions.json.gz'],
  70. ['psql', '-c', '\\copy gr_raw_interaction (gr_int_data) FROM STDIN']
  71. ])
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...