Browse Source

Use graph-tool for graph export

Michael Ekstrand 9 months ago
parent
commit
98cea55c71
3 changed files with 170 additions and 248 deletions
  1. 134
    53
      bookdata/graph.py
  2. 23
    18
      bookdata/schema.py
  3. 13
    177
      scripts/inspect-idgraph.py

+ 134
- 53
bookdata/graph.py

@@ -11,6 +11,111 @@ from .schema import *
 
 _log = logging.getLogger(__name__)
 
class MinGraphBuilder:
    """Accumulate a minimal graph-tool graph whose vertices carry only
    numeric ``code`` and ``source`` vertex properties (no labels or
    per-source attribute columns)."""

    def __init__(self):
        self.graph = Graph(directed=False)
        self.codes = []     # per-batch arrays of namespace-offset ids
        self.sources = []   # per-batch arrays of namespace source codes

    def add_nodes(self, df, ns):
        """Add one vertex per row of *df* (must have an ``id`` column),
        tagged with namespace *ns*.

        Returns a Series mapping each source id to its vertex index,
        for use as the src/dst map in :meth:`add_edges`.
        """
        n = len(df)
        _log.info('adding %d nodes to graph', n)
        start = self.graph.num_vertices()
        # add_vertex's return value is unused; we index vertices numerically
        self.graph.add_vertex(n)
        end = self.graph.num_vertices()
        assert end - start == n
        nodes = pd.Series(np.arange(start, end, dtype='i4'), index=df['id'])
        # shift raw ids into the namespace's globally-unique code range
        self.codes.append(df['id'].values + ns.offset)
        self.sources.append(np.full(n, ns.code, dtype='i2'))
        return nodes

    def add_edges(self, f, src, dst):
        """Add edges from frame *f*: its first column is looked up in the
        *src* vertex map, its second column in the *dst* vertex map."""
        _log.info('adding %d edges to graph', len(f))
        edges = np.zeros((len(f), 2), dtype='i4')
        edges[:, 0] = src.loc[f.iloc[:, 0]]
        edges[:, 1] = dst.loc[f.iloc[:, 1]]
        self.graph.add_edge_list(edges)

    def finish(self):
        """Attach the accumulated code/source vertex properties and
        return the finished graph."""
        _log.info('setting code attributes')
        code_a = self.graph.new_vp('int64_t')
        code_a.a[:] = np.concatenate(self.codes)
        self.graph.vp['code'] = code_a

        _log.info('setting source attributes')
        source_a = self.graph.new_vp('int16_t')
        source_a.a[:] = np.concatenate(self.sources)
        self.graph.vp['source'] = source_a

        return self.graph
+
+
class FullGraphBuilder:
    """Accumulate a full graph-tool graph whose vertices carry string
    ``source`` and ``label`` properties plus any extra attribute columns
    present in the node frames (e.g. titles)."""

    def __init__(self):
        self.graph = Graph(directed=False)
        self.codes = []     # per-batch arrays of namespace-offset ids
        self.sources = []   # per-batch arrays of namespace source codes
        self.labels = []    # one label string per vertex, in add order
        self.attrs = set()  # attribute columns with a vertex property created

    def add_nodes(self, df, ns):
        """Add one vertex per row of *df* (must have an ``id`` column),
        tagged with namespace *ns*.

        If *df* has a ``label`` column it supplies vertex labels,
        otherwise the stringified id is used.  Any other columns are
        stored as string vertex properties.  Returns a Series mapping
        each source id to its vertex index.
        """
        n = len(df)
        _log.info('adding %d nodes to graph', n)
        start = self.graph.num_vertices()
        self.graph.add_vertex(n)
        end = self.graph.num_vertices()
        assert end - start == n
        # Materialize vertex handles explicitly: add_vertex returns a
        # single-use iterator for n > 1 (exhausted after one zip, so a
        # second attribute column would silently get no values) and a
        # lone, non-iterable Vertex for n == 1.
        verts = [self.graph.vertex(i) for i in range(start, end)]
        nodes = pd.Series(np.arange(start, end, dtype='i4'), index=df['id'])
        # shift raw ids into the namespace's globally-unique code range
        self.codes.append(df['id'].values + ns.offset)
        self.sources.append(np.full(n, ns.code, dtype='i2'))
        if 'label' in df.columns:
            self.labels += list(df['label'].values)
        else:
            self.labels += list(df['id'].astype('str').values)

        for c in df.columns:
            if c in ['id', 'label']:
                continue
            if c not in self.attrs:
                vp = self.graph.new_vp('string')
                self.graph.vp[c] = vp
                self.attrs.add(c)
            else:
                vp = self.graph.vp[c]

            for v, val in zip(verts, df[c].values):
                vp[v] = val

        return nodes

    def add_edges(self, f, src, dst):
        """Add edges from frame *f*: its first column is looked up in the
        *src* vertex map, its second column in the *dst* vertex map."""
        _log.info('adding %d edges to graph', len(f))
        edges = np.zeros((len(f), 2), dtype='i4')
        edges[:, 0] = src.loc[f.iloc[:, 0]]
        edges[:, 1] = dst.loc[f.iloc[:, 1]]
        self.graph.add_edge_list(edges)

    def finish(self):
        """Attach the accumulated code/source/label vertex properties and
        return the finished graph."""
        _log.info('setting code attributes')
        code_a = self.graph.new_vp('int64_t')
        code_a.a[:] = np.concatenate(self.codes)
        self.graph.vp['code'] = code_a

        _log.info('setting source attributes')
        source_a = self.graph.new_vp('string')
        for v, s in zip(self.graph.vertices(), np.concatenate(self.sources)):
            source_a[v] = src_label_rev[s]
        self.graph.vp['source'] = source_a

        # was a copy-paste of the 'source' message above
        _log.info('setting label attributes')
        label_a = self.graph.new_vp('string')
        for v, l in zip(self.graph.vertices(), self.labels):
            label_a[v] = l
        self.graph.vp['label'] = label_a

        return self.graph
+
+
 class GraphLoader:
     cluster = None
     isbn_table = 'isbn_id'
@@ -26,8 +131,11 @@ class GraphLoader:
             WHERE cluster = %s
         ''', [self.cluster])
 
def q_isbns(self, full=True):
    """Return the SQL query for ISBN nodes.

    With ``full=True`` the query also selects the ISBN string (for use
    as a vertex label); otherwise only the numeric id is selected.
    """
    cols = 'isbn_id AS id, isbn' if full else 'isbn_id AS id'
    return f'SELECT {cols} FROM {self.isbn_table}'
 
     @property
     def limit(self):
@@ -108,7 +216,7 @@ class GraphLoader:
     def q_gr_work_nodes(self, full=False):
         if full:
             return f'''
-                SELECT DISTINCT gr_work_id AS id, work_title
+                SELECT DISTINCT gr_work_id AS id, work_title AS title
                 FROM gr.book_isbn {self.limit}
                 JOIN gr.book_ids ids USING (gr_book_id)
                 LEFT JOIN gr.work_title USING (gr_work_id)
@@ -136,84 +244,57 @@ class GraphLoader:
             WHERE ids.gr_work_id IS NOT NULL
         '''
 
def load_graph(self, cxn, full=False):
    """Load the identifier graph over DB connection *cxn*.

    With ``full=True`` the graph is built with :class:`FullGraphBuilder`
    (string source/label properties plus extra attributes); otherwise a
    :class:`MinGraphBuilder` graph with only numeric code/source
    properties is built.  Returns the finished graph-tool Graph.
    """
    if full:
        gb = FullGraphBuilder()
    else:
        gb = MinGraphBuilder()

    _log.info('fetching ISBNs')
    isbns = pd.read_sql_query(self.q_isbns(full), cxn)
    # rename the *column* (columns=...), not the index: without it the
    # mapper is applied to row labels and the ISBN string never becomes
    # the vertex label, so full graphs fall back to numeric ids
    isbn_nodes = gb.add_nodes(isbns.rename(columns={'isbn': 'label'}), ns_isbn)

    _log.info('fetching LOC records')
    loc_recs = pd.read_sql_query(self.q_loc_nodes(full), cxn)
    loc_nodes = gb.add_nodes(loc_recs, ns_loc_rec)

    _log.info('fetching LOC ISBN links')
    loc_edges = pd.read_sql_query(self.q_loc_edges(), cxn)
    gb.add_edges(loc_edges, isbn_nodes, loc_nodes)

    _log.info('fetching OL editions')
    ol_eds = pd.read_sql_query(self.q_ol_edition_nodes(full), cxn)
    ol_e_nodes = gb.add_nodes(ol_eds, ns_edition)

    _log.info('fetching OL works')
    ol_wks = pd.read_sql_query(self.q_ol_work_nodes(full), cxn)
    ol_w_nodes = gb.add_nodes(ol_wks, ns_work)

    _log.info('fetching OL ISBN edges')
    ol_ie_edges = pd.read_sql_query(self.q_ol_edition_edges(), cxn)
    gb.add_edges(ol_ie_edges, isbn_nodes, ol_e_nodes)

    _log.info('fetching OL edition/work edges')
    ol_ew_edges = pd.read_sql_query(self.q_ol_work_edges(), cxn)
    gb.add_edges(ol_ew_edges, ol_e_nodes, ol_w_nodes)

    _log.info('fetching GR books')
    gr_books = pd.read_sql_query(self.q_gr_book_nodes(full), cxn)
    gr_b_nodes = gb.add_nodes(gr_books, ns_gr_book)

    _log.info('fetching GR ISBN edges')
    gr_ib_edges = pd.read_sql_query(self.q_gr_book_edges(), cxn)
    gb.add_edges(gr_ib_edges, isbn_nodes, gr_b_nodes)

    _log.info('fetching GR works')
    gr_works = pd.read_sql_query(self.q_gr_work_nodes(full), cxn)
    gr_w_nodes = gb.add_nodes(gr_works, ns_gr_work)

    _log.info('fetching GR work/edition edges')
    gr_bw_edges = pd.read_sql_query(self.q_gr_work_edges(), cxn)
    gb.add_edges(gr_bw_edges, gr_b_nodes, gr_w_nodes)

    g = gb.finish()
    _log.info('imported %s', g)

    return g

+ 23
- 18
bookdata/schema.py

@@ -4,23 +4,28 @@ Data schema information for the book data tools.
 
 import pandas as pd
 
-ns_work=100000000
-ns_edition=200000000
-ns_rec=300000000
-ns_gr_work=400000000
-ns_gr_book=500000000
-ns_loc_work=600000000
-ns_loc_instance=700000000
-ns_isbn=900000000
class NS:
    """A numeric namespace for book-data identifiers.

    Each namespace has a short *name* (used as a source label), a small
    integer *code*, and an *offset* of ``code * 100000000`` that is added
    to raw ids to make them globally unique across namespaces.
    """

    def __init__(self, name, num):
        self.name = name
        self.code = num
        self.offset = num * 100000000

    def __repr__(self):
        return f'NS({self.name!r}, {self.code})'
 
-src_labels = pd.Series({
-    'OL-W': 1,
-    'OL-E': 2,
-    'LOC': 3,
-    'GR-W': 4,
-    'GR-B': 5,
-    'LOC-W': 6,
-    'LOC-I': 7,
-    'ISBN': 9
-})
# Identifier namespace registry: one NS per identifier source.
ns_work = NS('OL-W', 1)
ns_edition = NS('OL-E', 2)
ns_loc_rec = NS('LOC', 3)
ns_gr_work = NS('GR-W', 4)
ns_gr_book = NS('GR-B', 5)
ns_loc_work = NS('LOC-W', 6)
ns_loc_instance = NS('LOC-I', 7)
ns_isbn = NS('ISBN', 9)

numspaces = [
    ns_work, ns_edition,
    ns_loc_rec,
    ns_gr_work, ns_gr_book,
    ns_loc_work, ns_loc_instance,
    ns_isbn
]

# Forward (label -> code) and reverse (code -> label) lookup series.
src_labels = pd.Series({space.name: space.code for space in numspaces})
src_label_rev = pd.Series(src_labels.index, index=src_labels.values)

+ 13
- 177
scripts/inspect-idgraph.py

@@ -29,104 +29,6 @@ from bookdata import tracking, db, script_log
 from bookdata.graph import GraphLoader
 
 
-class GMLWriter:
-    def __init__(self, out):
-        self.output = out
-        self._n_attrs = set(['id'])
-
-    def _p(self, code, *args):
-        print(code.format(*args), file=self.output)
-
-    def node_attr(self, name):
-        self._n_attrs.add(name)
-
-    def start(self):
-        self._p('graph [')
-        self._p('  directed 0')
-
-    def finish(self):
-        self._p(']')
-
-    def node(self, **attrs):
-        self._p('  node [')
-        for k, v in attrs.items():
-            if k not in self._n_attrs:
-                raise RuntimeError('unknown node attribute ' + k)
-            if k == 'label':
-                v = str(v)
-            if v is not None:
-                self._p('    {} {}', k, json.dumps(v))
-        self._p('  ]')
-
-    def edge(self, **attrs):
-        self._p('  edge [')
-        for k, v in attrs.items():
-            if v is not None:
-                self._p('    {} {}', k, json.dumps(v))
-        self._p('  ]')
-
-
-class GraphMLWriter:
-    _g_started = False
-
-    def __init__(self, out):
-        self.output = out
-        self.tb = etree.TreeBuilder()
-        self._ec = 0
-
-    def node_attr(self, name, type='string'):
-        self.tb.start('key', {
-            'id': name,
-            'for': 'node',
-            'attr.name': name,
-            'attr.type': type
-        })
-        self.tb.end('key')
-
-    def start(self):
-        self.tb.start('graphml', {
-            'xmlns': 'http://graphml.graphdrawing.org/xmlns',
-            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': d('''
-                http://graphml.graphdrawing.org/xmlns
-                http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd
-            ''').strip(),
-        })
-
-    def finish(self):
-        self.tb.end('graph')
-        self.tb.end('graphml')
-        elt = self.tb.close()
-        tree = etree.ElementTree(elt)
-        tree.write(self.output, encoding='unicode')
-
-    def node(self, id, **attrs):
-        if not self._g_started:
-            self.tb.start('graph', {
-                'edgedefault': 'undirected'
-            })
-            self._g_started = True
-
-        self.tb.start('node', {
-            'id': id
-        })
-        for k, v in attrs.items():
-            if v is not None:
-                self.tb.start('data', {'key': k})
-                self.tb.data(str(v))
-                self.tb.end('data')
-        self.tb.end('node')
-
-    def edge(self, source, target):
-        self._ec += 1
-        eid = self._ec
-        self.tb.start('edge', {
-            'id': f'e{eid}',
-            'source': source,
-            'target': target
-        })
-        self.tb.end('edge')
-
-
 def stats(dbc, out, opts):
     "Compute statistics of the clustering"
     with dbc.cursor() as cur:
@@ -190,83 +92,18 @@ def records(dbc, out, opts):
     bc_recs.to_csv(out, index=False)
 
 
def graph(opts):
    """Export the full identifier graph for one cluster to the file
    named by ``-o``, in any format graph-tool's save() supports
    (inferred from the file extension)."""
    cluster = opts['CLUSTER']
    ofn = opts['-o']
    # fail fast: without an output file, g.save(None) would crash
    # cryptically only after the (expensive) DB load completed
    if not ofn:
        raise ValueError('--graph requires an output file (-o)')

    _log.info('exporting graph for cluster %s', cluster)

    gl = GraphLoader()
    with db.engine().connect() as cxn:
        gl.set_cluster(cluster, cxn)
        g = gl.load_graph(cxn, True)

    _log.info('saving graph to %s', ofn)
    g.save(ofn)
 
 
 def full_graph(opts):
@@ -283,18 +120,17 @@ def full_graph(opts):
 _log = script_log(__name__)
 opts = docopt(__doc__)
 
-if opts['-o']:
-    out = open(opts['-o'], 'w', encoding='utf8')
-else:
-    out = sys.stdout
-
 if opts['--full-graph']:
     full_graph(opts)
+elif opts['--graph']:
+    graph(opts)
 else:
+    if opts['-o']:
+        out = open(opts['-o'], 'w', encoding='utf8')
+    else:
+        out = sys.stdout
     with db.connect() as dbc:
         if opts['--stats']:
             stats(dbc, out, opts)
         elif opts['--records']:
             records(dbc, out, opts)
-        elif opts['--graph']:
-            graph(dbc, out, opts)