graph.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. """
  2. Utiltiies for loading & working with the book identifier graph.
  3. """
  4. import logging
  5. import pandas as pd
  6. import numpy as np
  7. from graph_tool import Graph
  8. from .schema import *
  9. _log = logging.getLogger(__name__)
  10. class GraphLoader:
  11. cluster = None
  12. isbn_table = 'isbn_id'
  13. def set_cluster(self, cluster, cur):
  14. _log.info('restricting graph load to cluster %s', cluster)
  15. self.cluster = cluster
  16. self.isbn_table = 'gc_isbns'
  17. cur.execute('''
  18. CREATE TEMPORARY TABLE gc_isbns
  19. AS SELECT isbn_id, isbn
  20. FROM isbn_cluster JOIN isbn_id USING (isbn_id)
  21. WHERE cluster = %s
  22. ''', [self.cluster])
  23. def q_isbns(self):
  24. return f'SELECT isbn_id AS id, isbn FROM {self.isbn_table}'
  25. @property
  26. def limit(self):
  27. if self.isbn_table == 'isbn_id':
  28. return ''
  29. else:
  30. return f'JOIN {self.isbn_table} USING (isbn_id)'
  31. def q_loc_nodes(self, full=False):
  32. if full:
  33. return f'''
  34. SELECT DISTINCT rec_id AS id, title
  35. FROM locmds.book_rec_isbn {self.limit}
  36. LEFT JOIN locmds.book_title USING (rec_id)
  37. '''
  38. else:
  39. return f'''
  40. SELECT DISTINCT rec_id AS id
  41. FROM locmds.book_rec_isbn {self.limit}
  42. '''
  43. def q_loc_edges(self):
  44. return f'''
  45. SELECT isbn_id, rec_id
  46. FROM locmds.book_rec_isbn {self.limit}
  47. '''
  48. def q_ol_edition_nodes(self, full=False):
  49. if full:
  50. return f'''
  51. SELECT DISTINCT
  52. edition_id AS id, edition_key AS label,
  53. NULLIF(edition_data->>'title', '') AS title
  54. FROM ol.isbn_link {self.limit}
  55. JOIN ol.edition USING (edition_id)
  56. '''
  57. else:
  58. return f'''
  59. SELECT DISTINCT edition_id AS id
  60. FROM ol.isbn_link {self.limit}
  61. '''
  62. def q_ol_work_nodes(self, full=False):
  63. if full:
  64. return f'''
  65. SELECT DISTINCT
  66. work_id AS id, work_key AS label,
  67. NULLIF(work_data->>'title', '') AS title
  68. FROM ol.isbn_link {self.limit}
  69. JOIN ol.work USING (work_id)
  70. '''
  71. else:
  72. return f'''
  73. SELECT DISTINCT work_id AS id
  74. FROM ol.isbn_link {self.limit}
  75. WHERE work_id IS NOT NULL
  76. '''
  77. def q_ol_edition_edges(self):
  78. return f'''
  79. SELECT DISTINCT isbn_id, edition_id
  80. FROM ol.isbn_link {self.limit}
  81. '''
  82. def q_ol_work_edges(self):
  83. return f'''
  84. SELECT DISTINCT edition_id, work_id
  85. FROM ol.isbn_link {self.limit}
  86. WHERE work_id IS NOT NULL
  87. '''
  88. def q_gr_book_nodes(self, full=False):
  89. return f'''
  90. SELECT DISTINCT gr_book_id AS id
  91. FROM gr.book_isbn {self.limit}
  92. '''
  93. def q_gr_work_nodes(self, full=False):
  94. if full:
  95. return f'''
  96. SELECT DISTINCT gr_work_id AS id, work_title
  97. FROM gr.book_isbn {self.limit}
  98. JOIN gr.book_ids ids USING (gr_book_id)
  99. LEFT JOIN gr.work_title USING (gr_work_id)
  100. WHERE ids.gr_work_id IS NOT NULL
  101. '''
  102. else:
  103. return f'''
  104. SELECT DISTINCT gr_work_id AS id
  105. FROM gr.book_isbn {self.limit}
  106. JOIN gr.book_ids ids USING (gr_book_id)
  107. WHERE ids.gr_work_id IS NOT NULL
  108. '''
  109. def q_gr_book_edges(self, full=False):
  110. return f'''
  111. SELECT DISTINCT isbn_id, gr_book_id
  112. FROM gr.book_isbn {self.limit}
  113. '''
  114. def q_gr_work_edges(self):
  115. return f'''
  116. SELECT DISTINCT gr_book_id, gr_work_id
  117. FROM gr.book_isbn {self.limit}
  118. JOIN gr.book_ids ids USING (gr_book_id)
  119. WHERE ids.gr_work_id IS NOT NULL
  120. '''
  121. def load_minimal_graph(self, cxn):
  122. g = Graph(directed=False)
  123. codes = []
  124. sources = []
  125. def add_nodes(df, ns, src):
  126. n = len(df)
  127. _log.info('adding %d nodes to graph', n)
  128. start = g.num_vertices()
  129. vs = g.add_vertex(n)
  130. end = g.num_vertices()
  131. assert end - start == n
  132. nodes = pd.Series(np.arange(start, end, dtype='i4'), index=df['id'])
  133. codes.append(df['id'].values + ns)
  134. sources.append(np.full(n, src, dtype='i2'))
  135. return nodes
  136. def add_edges(f, src, dst):
  137. _log.info('adding %d edges to graph', len(f))
  138. edges = np.zeros((len(f), 2), dtype='i4')
  139. edges[:, 0] = src.loc[f.iloc[:, 0]]
  140. edges[:, 1] = dst.loc[f.iloc[:, 1]]
  141. g.add_edge_list(edges)
  142. _log.info('fetching ISBNs')
  143. isbns = pd.read_sql_query(self.q_isbns(), cxn)
  144. isbn_nodes = add_nodes(isbns.drop(columns=['isbn']), ns_isbn, 9)
  145. _log.info('fetching LOC records')
  146. loc_recs = pd.read_sql_query(self.q_loc_nodes(False), cxn)
  147. loc_nodes = add_nodes(loc_recs, ns_rec, 3)
  148. _log.info('fetching LOC ISBN links')
  149. loc_edges = pd.read_sql_query(self.q_loc_edges(), cxn)
  150. add_edges(loc_edges, isbn_nodes, loc_nodes)
  151. _log.info('fetching OL editions')
  152. ol_eds = pd.read_sql_query(self.q_ol_edition_nodes(False), cxn)
  153. ol_e_nodes = add_nodes(ol_eds, ns_edition, 2)
  154. _log.info('fetching OL works')
  155. ol_wks = pd.read_sql_query(self.q_ol_work_nodes(False), cxn)
  156. ol_w_nodes = add_nodes(ol_wks, ns_work, 1)
  157. _log.info('fetching OL ISBN edges')
  158. ol_ie_edges = pd.read_sql_query(self.q_ol_edition_edges(), cxn)
  159. add_edges(ol_ie_edges, isbn_nodes, ol_e_nodes)
  160. _log.info('fetching OL edition/work edges')
  161. ol_ew_edges = pd.read_sql_query(self.q_ol_work_edges(), cxn)
  162. add_edges(ol_ew_edges, ol_e_nodes, ol_w_nodes)
  163. _log.info('fetching GR books')
  164. gr_books = pd.read_sql_query(self.q_gr_book_nodes(False), cxn)
  165. gr_b_nodes = add_nodes(gr_books, ns_gr_book, 5)
  166. _log.info('fetching GR ISBN edges')
  167. gr_ib_edges = pd.read_sql_query(self.q_gr_book_edges(), cxn)
  168. add_edges(gr_ib_edges, isbn_nodes, gr_b_nodes)
  169. _log.info('fetching GR works')
  170. gr_works = pd.read_sql_query(self.q_gr_work_nodes(False), cxn)
  171. gr_w_nodes = add_nodes(gr_works, ns_gr_work, 4)
  172. _log.info('fetching GR work/edition edges')
  173. gr_bw_edges = pd.read_sql_query(self.q_gr_work_edges(), cxn)
  174. add_edges(gr_bw_edges, gr_b_nodes, gr_w_nodes)
  175. _log.info('setting code attributes')
  176. code_a = g.new_vp('int64_t')
  177. code_a.a[:] = np.concatenate(codes)
  178. g.vp['code'] = code_a
  179. _log.info('setting source attributes')
  180. source_a = g.new_vp('int16_t')
  181. source_a.a[:] = np.concatenate(sources)
  182. g.vp['source'] = source_a
  183. _log.info('imported %s', g)
  184. return g
Tip!

Press p or to see the previous file or, n or to see the next file