Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

mergedatasets.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
  1. import argparse
  2. import re
  3. import shutil
  4. import sys
  5. from pathlib import Path
  6. from deadtrees.data.deadtreedata import DeadtreeDatasetConfig, split_shards
  7. def main():
  8. parser = argparse.ArgumentParser()
  9. parser.add_argument("indirs", type=Path, nargs="+")
  10. parser.add_argument(
  11. "--outdir",
  12. dest="outdir",
  13. type=Path,
  14. default=Path("data/dataset"),
  15. help="output directory for merged dataset",
  16. )
  17. args = parser.parse_args()
  18. args.outdir.mkdir(parents=True, exist_ok=True)
  19. if len(args.indirs) < 2:
  20. print("At least two indirs are required!\n")
  21. parser.print_help()
  22. sys.exit(1)
  23. # find year in path str
  24. years = [re.search(r"\d{4}", str(d)) for d in args.indirs]
  25. years_extracted = [y.group() for y in years if y]
  26. if len(years_extracted) != len(args.indirs):
  27. print("Extracting year info from indirs failed!\n")
  28. parser.print_help()
  29. sys.exit(1)
  30. # create train, validation, test folders
  31. (args.outdir / "train").mkdir(parents=True, exist_ok=True)
  32. (args.outdir / "val").mkdir(parents=True, exist_ok=True)
  33. (args.outdir / "test").mkdir(parents=True, exist_ok=True)
  34. for year, indir in zip(years_extracted, args.indirs):
  35. def copy_to_dst(files, subdir):
  36. for infile in files:
  37. infile = Path(infile)
  38. f = infile.name.split("-0")
  39. outfile = args.outdir / subdir / f"{f[0]}-{year}-0{f[1]}"
  40. shutil.copyfile(str(infile), str(outfile))
  41. train_files, val_files, test_files = split_shards(
  42. sorted(indir.glob("*.tar")), DeadtreeDatasetConfig.fractions
  43. )
  44. copy_to_dst(train_files, "train")
  45. copy_to_dst(val_files, "val")
  46. copy_to_dst(test_files, "test")
  47. if __name__ == "__main__":
  48. main()
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...