Browse Source

Download LOC file listing

Michael Ekstrand 1 year ago
parent
commit
218ea6b584

+ 1
- 0
.dvc/.gitignore

@@ -6,3 +6,4 @@
 /state-wal
 /state
 /cache
+/tmp

+ 44
- 0
bookdata/__init__.py

@@ -0,0 +1,44 @@
+import sys
+import pathlib
+import logging
+
+# Formatter used by the console handler installed in setup(): timestamp,
+# level name padded to 7 characters, logger name, then the message.
+_simple_format = logging.Formatter('{asctime} [{levelname:7s}] {name} {message}',
+                                   datefmt='%Y-%m-%d %H:%M:%S',
+                                   style='{')
+
+# Set to True at the end of setup(); script_log() checks it so that it only
+# calls setup() when logging has not been configured yet.
+_initialized = False
+
+
+def setup(debug=False):
+    """
+    Configure console logging for the current process.
+
+    Attaches a stderr handler using the module's simple format to the root
+    logger, adjusts the levels of the ``dvc`` and ``lenskit`` loggers, and
+    marks the log system as initialized.
+
+    Args:
+        debug(bool): whether the console handler should show DEBUG messages
+            (otherwise it shows INFO and above)
+    """
+    global _initialized
+    ch = logging.StreamHandler(sys.stderr)
+    ch.setLevel(logging.DEBUG if debug else logging.INFO)
+    ch.setFormatter(_simple_format)
+
+    root = logging.getLogger()
+    root.addHandler(ch)
+    # The root logger stays wide open at DEBUG; the handler's own level is
+    # what decides which records reach the console.  getLogger('') is this
+    # same root logger, so one setLevel call is all that is needed.
+    root.setLevel(logging.DEBUG)
+
+    # Per-library overrides: only errors from dvc, everything from lenskit.
+    logging.getLogger('dvc').setLevel(logging.ERROR)
+    logging.getLogger('lenskit').setLevel(logging.DEBUG)
+    root.debug('log system configured')
+    _initialized = True
+
+
+def script_log(name, debug=False):
+    """
+    Get a logger for a script, configuring logging first if needed.
+
+    Args:
+        name(str): The ``__file__`` of the script being run; the logger is
+            named after the stem of this path.
+        debug(bool): whether to enable debug logging to the console; only
+            used when this call is the one that configures logging.
+
+    Returns:
+        logging.Logger: the logger for the script.
+    """
+    if not _initialized:
+        setup(debug)
+
+    script_stem = pathlib.Path(name).stem
+    return logging.getLogger(script_stem)

+ 1
- 0
data/.gitignore

@@ -12,3 +12,4 @@
 /BX-Books.csv
 /BX-Users.csv
 /viaf-20181104-clusters-marc21.xml.gz
+/loc-listings

+ 10
- 0
data/loc-listings.dvc

@@ -0,0 +1,10 @@
+md5: a5e6fb8537ea047d6f2503418e8d9888
+cmd: python -m scripts.loc.list-files https://www.loc.gov/cds/downloads/MDSConnect/
+  data/loc-listings
+wdir: ..
+outs:
+- md5: a758253fe6f30d4adad8b208082fc517.dir
+  path: data/loc-listings
+  cache: true
+  metric: false
+  persist: false

+ 2
- 0
environment.yml

@@ -19,6 +19,8 @@ dependencies:
 - sqlalchemy
 - rust>=1.32
 - postgresql>=10
+- requests
+- html5lib
 # DVC deps
 - contextlib2
 - gitpython

+ 0
- 0
scripts/__init__.py

+ 0
- 0
scripts/loc/__init__.py

+ 48
- 0
scripts/loc/list-files.py

@@ -0,0 +1,48 @@
+"""
+Usage:
+    scripts.loc.list-files URL DIR
+"""
+
+import requests
+import re
+from pathlib import Path
+import html5lib
+
+from bookdata import script_log
+from docopt import docopt
+
+_log = script_log(__file__)
+
+# Parse the command line against the usage text in the module docstring.
+args = docopt(__doc__)
+
+url = args.get('URL')
+out_dir = Path(args.get('DIR'))
+# File names are appended directly to the base URL when the listings are
+# written, so make sure it ends with a slash; otherwise the last path
+# segment and the file name would run together.
+base_url = url if url.endswith('/') else url + '/'
+
+_log.info('fetching %s', url)
+res = requests.get(url)
+# Fail loudly on an HTTP error instead of parsing the error page and
+# silently writing no listings.
+res.raise_for_status()
+
+tree = html5lib.parse(res.text)
+# Matches link targets shaped like <name>.<year>.part<NN>.xml.gz
+url_re = re.compile(r'^(?P<name>[A-Za-z.]+)\.(?P<year>\d+)\.part(?P<part>\d+)\.xml\.gz$')
+
+# Maps '<name>.<year>' to the matching file names, in the order encountered.
+links = {}
+
+for link in tree.findall('.//{http://www.w3.org/1999/xhtml}a'):
+    href = link.get('href')
+    _log.debug('checking link %s', href)
+    if href is None:
+        # <a> elements without an href attribute have nothing to match.
+        continue
+    lm = url_re.match(href)
+    if lm is None:
+        continue
+    key = f"{lm['name']}.{lm['year']}"
+    links.setdefault(key, []).append(lm[0])
+
+# One .lst file per '<name>.<year>' group, one absolute URL per line.
+out_dir.mkdir(exist_ok=True, parents=True)
+for key, files in links.items():
+    _log.info('writing %d files for %s', len(files), key)
+    kf = out_dir / f'{key}.lst'
+    with kf.open('w') as f:
+        f.writelines(f'{base_url}{fn}\n' for fn in files)