Browse Source

Use Curl instead of wget/aria2 for downloads

Michael Ekstrand 6 months ago
parent
commit
f27edd8757
7 changed files with 7 additions and 64 deletions
  1. data/.gitignore (+0, -1)
  2. data/loc-books.dvc (+3, -4)
  3. data/loc-listings.dvc (+0, -7)
  4. data/loc-names.dvc (+3, -4)
  5. environment.yml (+1, -0)
  6. scripts/loc/__init__.py (+0, -0)
  7. scripts/loc/list-files.py (+0, -48)

+ 0
- 1
data/.gitignore

@@ -10,7 +10,6 @@
 /BX-Book-Ratings.csv
 /BX-Books.csv
 /BX-Users.csv
-/loc-listings
 /loc-books
 /loc-names
 /viaf-clusters-marc21.xml.gz

+ 3
- 4
data/loc-books.dvc

@@ -1,7 +1,6 @@
-cmd: aria2c -d loc-books -i loc-listings/BooksAll.2016.lst
-deps:
-- path: loc-listings/BooksAll.2016.lst
-  md5: 695c2898d88df8eb50f086488a492433
+# Update from https://www.loc.gov/cds/products/MDSConnect-books_all.html when necessary
+cmd: curl https://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2016.part[01-43].xml.gz
+  --parallel --parallel-max 4 -o "loc-books/BooksAll.2016.part#1.xml.gz" --create-dirs
 outs:
 - path: loc-books
   md5: 1b1e7ab1d98cc81e373dfc53345d4bb7.dir

+ 0
- 7
data/loc-listings.dvc

@@ -1,7 +0,0 @@
-md5: 9b1ae0e4404499438b43a7d9276ca520
-cmd: python run.py loc.list-files https://www.loc.gov/cds/downloads/MDSConnect/ data/loc-listings
-wdir: ..
-outs:
-- md5: a758253fe6f30d4adad8b208082fc517.dir
-  path: data/loc-listings
-frozen: true

+ 3
- 4
data/loc-names.dvc

@@ -1,7 +1,6 @@
-cmd: wget -P loc-names -i loc-listings/Names.2016.lst
-deps:
-- path: loc-listings/Names.2016.lst
-  md5: c041ad93309f581e602f54bf4994ef78
+# Update from https://www.loc.gov/cds/products/MDSConnect-name_authorities.html when necessary
+cmd: curl https://www.loc.gov/cds/downloads/MDSConnect/Names.2016.part[01-40].xml.gz
+  --parallel --parallel-max 4 -o "loc-names/Names.2016.part#1.xml.gz" --create-dirs
 outs:
 - path: loc-names
   md5: fc488a8775561070cced774803fe0d72.dir

+ 1
- 0
environment.yml

@@ -24,6 +24,7 @@ dependencies:
 - ipywidgets
 - requests
 - html5lib
+- curl
 - gitpython
 - dvc>=1.1
 - boto3

+ 0
- 0
scripts/loc/__init__.py

+ 0
- 48
scripts/loc/list-files.py

@@ -1,48 +0,0 @@
-"""
-Usage:
-    scripts.loc.list-files URL DIR
-"""
-
-import requests
-import re
-from pathlib import Path
-import html5lib
-
-from bookdata import script_log
-from docopt import docopt
-
-_log = script_log(__name__)
-
-args = docopt(__doc__)
-
-url = args.get('URL')
-out_dir = Path(args.get('DIR'))
-
-_log.info('fetching %s', url)
-res = requests.get(url)
-
-tree = html5lib.parse(res.text)
-url_re = re.compile(r'^(?P<name>[A-Za-z.]+)\.(?P<year>\d+)\.part(?P<part>\d+)\.xml\.gz$')
-
-links = {}
-
-for link in tree.findall('.//{http://www.w3.org/1999/xhtml}a'):
-    href = link.get('href')
-    _log.debug('checking link %s', href)
-    lm = url_re.match(href)
-    if lm is not None:
-        file = lm['name']
-        year = lm['year']
-        fn = lm[0]
-        key = f'{file}.{year}'
-        if key not in links:
-            links[key] = []
-        links[key].append(fn)
-
-out_dir.mkdir(exist_ok=True, parents=True)
-for key, files in links.items():
-    _log.info('writing %d files for %s', len(files), key)
-    kf = out_dir / f'{key}.lst'
-    with kf.open('w') as f:
-        for fn in files:
-            f.write(f'{url}{fn}\n')