Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

bx-import.py 1.9 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  """
  Import BookCrossing ratings, with data cleaning for invalid characters.

  Usage:
      bx-import.py [-T <file>] <file>

  Options:
      -T FILE
          Write transcript to FILE [default: bx-import.transcript]
  """
  import csv
  import hashlib
  from io import StringIO

  import numpy as np
  from docopt import docopt
  from tqdm import tqdm

  from bookdata import script_log, db, tracking

  _log = script_log(__name__)
  opts = docopt(__doc__)
  src_file = opts.get('<file>')


  def _clean_bx_bytes(data):
      """Turn the raw bytes of the BX ratings dump into comma-separated text.

      Bytes >= 128 and carriage returns are dropped, and every ``;`` byte is
      rewritten to ``,`` so the result can be parsed by ``csv.DictReader``
      with its default dialect.

      Args:
          data: the complete contents of the source file, as ``bytes``.

      Returns:
          The cleaned content decoded to ``str``.
      """
      barr = np.frombuffer(data, dtype='u1')
      # delete bytes that are too big; boolean-mask indexing produces a new
      # array, so the in-place delimiter rewrite below never touches `data`
      barr = barr[barr < 128]
      # convert to LF
      barr = barr[barr != ord('\r')]
      # change delimiter to comma
      barr[barr == ord(';')] = ord(',')
      # only bytes < 128 remain, so this decode cannot hit a multi-byte sequence
      return bytes(barr).decode('utf8')


  # The transcript is opened before the input is read (as before), but inside a
  # `with` so it is flushed and closed even when cleaning or the import fails.
  with open(opts.get('-T'), 'w') as tx_file:
      _log.info("cleaning BX rating data")
      with open(src_file, 'rb') as bf:
          data = bf.read()
      # checksum of the untouched input, recorded in the transcript and tracking
      in_chk = hashlib.sha1(data).hexdigest()
      text = _clean_bx_bytes(data)

      # write
      _log.info('importing BX to database')
      rd = StringIO(text)
      with db.connect() as dbc:
          print('IMPORT TO bx.raw_ratings', file=tx_file)
          print('READ', src_file, in_chk, file=tx_file)
          # we're going to hash the data we insert
          dh = hashlib.md5()
          # with dbc encapsulates a transaction
          with dbc, dbc.cursor() as cur:
              tracking.begin_stage(cur, 'bx-ratings')
              tracking.record_file(cur, src_file, in_chk, 'bx-ratings')
              tracking.record_dep(cur, 'bx-ratings', 'bx-schema')
              n = 0
              for row in tqdm(csv.DictReader(rd)):
                  uid = row['User-ID']
                  isbn = row['ISBN']
                  rating = row['Book-Rating']
                  cur.execute('INSERT INTO bx.raw_ratings (user_id, isbn, rating) VALUES (%s, %s, %s)',
                              (uid, isbn, rating))
                  # hash exactly the values sent to the database, one row per line
                  dh.update(f'{uid}\t{isbn}\t{rating}\n'.encode('utf8'))
                  n += 1
              tracking.end_stage(cur, 'bx-ratings', key=dh.hexdigest())

      # reported once the transaction block has exited without raising
      print('INSERTED', n, dh.hexdigest(), file=tx_file)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...