Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

list-files.py 1.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  1. """
  2. Usage:
  3. scripts.loc.list-files URL DIR
  4. """
  5. import requests
  6. import re
  7. from pathlib import Path
  8. import html5lib
  9. from bookdata import script_log
  10. from docopt import docopt
  11. _log = script_log(__name__)
  12. args = docopt(__doc__)
  13. url = args.get('URL')
  14. out_dir = Path(args.get('DIR'))
  15. _log.info('fetching %s', url)
  16. res = requests.get(url)
  17. tree = html5lib.parse(res.text)
  18. url_re = re.compile(r'^(?P<name>[A-Za-z.]+)\.(?P<year>\d+)\.part(?P<part>\d+)\.xml\.gz$')
  19. links = {}
  20. for link in tree.findall('.//{http://www.w3.org/1999/xhtml}a'):
  21. href = link.get('href')
  22. _log.debug('checking link %s', href)
  23. lm = url_re.match(href)
  24. if lm is not None:
  25. file = lm['name']
  26. year = lm['year']
  27. fn = lm[0]
  28. key = f'{file}.{year}'
  29. if key not in links:
  30. links[key] = []
  31. links[key].append(fn)
  32. out_dir.mkdir(exist_ok=True, parents=True)
  33. for key, files in links.items():
  34. _log.info('writing %d files for %s', len(files), key)
  35. kf = out_dir / f'{key}.lst'
  36. with kf.open('w') as f:
  37. for fn in files:
  38. f.write(f'{url}{fn}\n')
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...