Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

create_sub_coco.py 2.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
  1. from pathlib import Path
  2. import json
  3. import shutil
  4. import argparse
  def copy_train_val_to_new_dir(
      input_data_dir: str,
      dest_data_dir: str,
      n_train_images: int = 1000,
      n_val_images: int = 500,
  ) -> None:
      """Create a subset of Coco2017 (train and val splits) in a new directory.

      The per-split work (filtering the instances file and copying the image
      files) is delegated to ``_copy_to_new_dir``, once for "train" and once
      for "val".

      :param input_data_dir: Where the original data is stored
      :param dest_data_dir: Where the resulting data should be stored
      :param n_train_images: How many train images to keep (default 1000)
      :param n_val_images: How many val images to keep (default 500)
      """
      input_data_dir = Path(input_data_dir)
      dest_data_dir = Path(dest_data_dir)

      _copy_to_new_dir("train", n_train_images, input_data_dir, dest_data_dir)
      _copy_to_new_dir("val", n_val_images, input_data_dir, dest_data_dir)
  def _copy_to_new_dir(mode: str, n_images: int, input_data_dir: Path, dest_data_dir: Path):
      """Copy a subset of either train or val from input dir into destination dir.

      Reads ``annotations/instances_{mode}2017.json`` from the input dir, keeps
      at most ``n_images`` images that have at least one annotation, writes the
      filtered instances file to the same relative location under the
      destination dir, and copies the kept image files from
      ``images/{mode}2017`` to the matching destination folder. All other
      top-level entries of the instances file are written back unchanged.

      :param mode: Either "train" or "val"
      :param n_images: How many images (with their annotations) to copy for this mode
      :param input_data_dir: Where the original data is stored
      :param dest_data_dir: Where the resulting data should be stored
      """
      split_name = f"{mode}2017"

      input_instances_path = input_data_dir / "annotations" / f"instances_{split_name}.json"
      dest_annotation_folder = dest_data_dir / "annotations"
      dest_annotation_folder.mkdir(exist_ok=True, parents=True)
      dest_instances_path = dest_annotation_folder / f"instances_{split_name}.json"

      with open(input_instances_path, "r", encoding="utf-8") as f:
          instances = json.load(f)

      image_ids = {image["id"] for image in instances["images"]}
      annotated_image_ids = {annotation["image_id"] for annotation in instances["annotations"]}

      # Only ids present in both images and annotations are eligible. Sorting
      # makes the selected subset reproducible, and keeping the result as a set
      # makes the membership tests below O(1) instead of a list scan per item.
      kept_image_ids = set(sorted(image_ids & annotated_image_ids)[:n_images])

      instances["annotations"] = [
          annotation
          for annotation in instances["annotations"]
          if annotation["image_id"] in kept_image_ids
      ]
      instances["images"] = [
          image for image in instances["images"] if image["id"] in kept_image_ids
      ]
      kept_images_name = [image["file_name"] for image in instances["images"]]

      input_images_dir = input_data_dir / "images" / split_name
      dest_images_dir = dest_data_dir / "images" / split_name
      dest_images_dir.mkdir(exist_ok=True, parents=True)

      with open(dest_instances_path, "w", encoding="utf-8") as f:
          json.dump(instances, f)

      for image_name in kept_images_name:
          shutil.copy(input_images_dir / image_name, dest_images_dir / image_name)
  if __name__ == "__main__":
      # Command-line entry point: read the source and destination folders from
      # the arguments and build the Coco subset.
      cli = argparse.ArgumentParser(description="Extract a sub set of Coco into specified dir")
      cli.add_argument(
          "--input_data_dir",
          default="/data/coco",
          help="Where the full coco dataset is stored",
      )
      cli.add_argument(
          "--dest_data_dir",
          required=True,
          help="Where the resulting data should be stored",
      )
      options = cli.parse_args()
      copy_train_val_to_new_dir(options.input_data_dir, options.dest_data_dir)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...