Are you sure you want to delete this access key?
import pandas as pd
import numpy as np
from numba import njit
%matplotlib inline
# Database connection URL handed to pd.read_sql by the queries in this file.
# NOTE(review): the user name and password are hard-coded in this string —
# consider reading them from the environment instead.
db_url = 'postgresql://openlib:piratelib@localhost/openlib'
# Numeric offsets keyed by identifier kind. In the visible code only 'rec' is
# used: it is added to LOC record ids before they are combined with the
# OpenLibrary link table.
# NOTE(review): presumably each kind gets its own id range so that ids from
# different sources cannot collide — confirm against the database schema.
numspaces = dict(work=100000000, edition=200000000, rec=300000000, isbn=900000000)
We cluster ISBNs by building the bipartite graph of ISBNs and records and computing the closure of each ISBN: every ISBN reachable from it through shared records. Each closure becomes a cluster with a single ‘book’ ID.
@njit
def _make_clusters(clusters, ls, rs):
    """Propagate the smallest cluster label along edges until nothing changes.

    ``clusters`` is updated in place.  ``ls`` and ``rs`` are parallel arrays of
    positions into ``clusters``; entry ``k`` describes an edge from ``ls[k]``
    to ``rs[k]``.  Labels only ever flow from the left endpoint to the right
    endpoint, so an undirected graph must list each edge in both directions.

    Returns the number of sweeps over the edge list, counting the final sweep
    that found nothing to update (0 when there are no edges at all).
    """
    n_edges = len(ls)
    sweeps = 0
    # Seed with the edge count so the loop is skipped entirely for an empty
    # edge list and entered at least once otherwise.
    pending = n_edges
    while pending > 0:
        sweeps += 1
        pending = 0
        for k in range(n_edges):
            dst = rs[k]
            label = clusters[ls[k]]
            if label < clusters[dst]:
                # The right endpoint adopts the smaller label of its neighbour.
                clusters[dst] = label
                pending += 1
    return sweeps
def cluster_isbns(isbn_recs):
    """Group ISBNs into clusters connected through shared records.

    Two ISBNs end up in the same cluster when they are linked, directly or
    through a chain of other ISBNs, by rows that share a ``record`` value.

    Parameters
    ----------
    isbn_recs : pandas.DataFrame
        One row per ISBN/record link, with columns ``isbn_id`` and ``record``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``isbn_id`` with columns ``isbn_id``, ``cluster``
        (the smallest ``record`` value found anywhere in the ISBN's cluster)
        and ``ino`` (the ISBN's 0-based position used while clustering).
    """
    print('initializing isbn vector')
    # Every ISBN starts out labelled with its own smallest record id.
    isbns = isbn_recs.groupby('isbn_id').record.min()
    isbns = isbns.reset_index(name='cluster')
    # Dense positional index so the kernel can address labels by array offset.
    isbns['ino'] = np.arange(len(isbns), dtype=np.int32)

    intbl = pd.merge(isbn_recs, isbns.loc[:, ['isbn_id', 'ino']], on='isbn_id')
    left = intbl.loc[:, ['record', 'ino']].rename(columns={'ino': 'left'})
    right = intbl.loc[:, ['record', 'ino']].rename(columns={'ino': 'right'})

    print('making edge table')
    # Self-join on record: every ordered pair of ISBNs sharing a record becomes
    # an edge, so each link appears in both directions as the kernel requires.
    edges = pd.merge(left, right, on='record')

    print('clustering')
    # Run the kernel on an explicit writable copy and store the result back.
    # Mutating ``isbns.cluster.values`` in place only works while pandas hands
    # out a writable view; with Copy-on-Write that array is read-only.
    labels = isbns['cluster'].to_numpy(copy=True)
    iters = _make_clusters(labels, edges['left'].to_numpy(), edges['right'].to_numpy())
    isbns['cluster'] = labels
    print('clustered in', iters, 'iterations')
    return isbns
def plot_cluster_sizes(clusters):
    """Scatter-plot how many clusters exist at each cluster size.

    ``clusters`` needs ``cluster`` and ``isbn_id`` columns.  The x axis is the
    number of ISBNs in a cluster, the y axis is the number of clusters of that
    size, and both axes are logarithmic.  Returns whatever the pandas scatter
    plot call returns.
    """
    # Number of ISBNs in each cluster.
    per_cluster = clusters.groupby('cluster')['isbn_id'].count().reset_index(name='size')
    # Number of clusters sharing each size.
    histogram = per_cluster.groupby('size')['cluster'].count().reset_index(name='nclusters')
    return histogram.plot.scatter(x='size', y='nclusters', loglog=True)
# Load every (isbn_id, rec_id) pair from the loc_rec_isbn table. rec_id is
# aliased to `record`, the column name cluster_isbns() groups and joins on.
loc_rec_isbns = pd.read_sql('''
SELECT isbn_id, rec_id AS record
FROM loc_rec_isbn
''', db_url)
# Peek at the first rows (presumably for display in the notebook).
loc_rec_isbns.head()
# Cluster the ISBNs using only these links, then plot the size distribution.
loc_clusters = cluster_isbns(loc_rec_isbns)
plot_cluster_sizes(loc_clusters)
# Save the clustering result; the file has neither a header row nor an index column.
loc_clusters.to_csv('data/loc-clusters.csv', index=False, header=False)
# Load every (isbn_id, book_code) pair from the ol_isbn_link table. book_code
# is aliased to `record`, the column name cluster_isbns() groups and joins on.
ol_rec_edges = pd.read_sql('''
SELECT isbn_id, book_code AS record
FROM ol_isbn_link
''', db_url)
# Cluster the ISBNs using only these links, then plot the size distribution.
ol_clusters = cluster_isbns(ol_rec_edges)
plot_cluster_sizes(ol_clusters)
# Save the clustering result; the file has neither a header row nor an index column.
ol_clusters.to_csv('data/ol-clusters.csv', index=False, header=False)
# Stack both link tables into one. LOC record ids are shifted by
# numspaces['rec'] first; the OpenLibrary book codes are used unchanged.
# NOTE(review): presumably the shift keeps LOC record ids from colliding with
# book codes — confirm that book_code values never reach that range.
all_isbn_recs = pd.concat([
loc_rec_isbns.assign(record=lambda df: df.record + numspaces['rec']),
ol_rec_edges
])
# Cluster the ISBNs over the combined links, then plot the size distribution.
int_clusters = cluster_isbns(all_isbn_recs)
plot_cluster_sizes(int_clusters)
# Save the clustering result; the file has neither a header row nor an index column.
int_clusters.to_csv('data/isbn-clusters.csv', index=False, header=False)
Press p or to see the previous file or, n or to see the next file
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?