Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

pcam.py 5.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
  1. import pathlib
  2. from typing import Any, Callable, Optional, Tuple, Union
  3. from PIL import Image
  4. from .utils import _decompress, download_file_from_google_drive, verify_str_arg
  5. from .vision import VisionDataset
  6. class PCAM(VisionDataset):
  7. """`PCAM Dataset <https://github.com/basveeling/pcam>`_.
  8. The PatchCamelyon dataset is a binary classification dataset with 327,680
  9. color images (96px x 96px), extracted from histopathologic scans of lymph node
  10. sections. Each image is annotated with a binary label indicating presence of
  11. metastatic tissue.
  12. This dataset requires the ``h5py`` package which you can install with ``pip install h5py``.
  13. Args:
  14. root (str or ``pathlib.Path``): Root directory of the dataset.
  15. split (string, optional): The dataset split, supports ``"train"`` (default), ``"test"`` or ``"val"``.
  16. transform (callable, optional): A function/transform that takes in a PIL image and returns a transformed
  17. version. E.g, ``transforms.RandomCrop``.
  18. target_transform (callable, optional): A function/transform that takes in the target and transforms it.
  19. download (bool, optional): If True, downloads the dataset from the internet and puts it into ``root/pcam``. If
  20. dataset is already downloaded, it is not downloaded again.
  21. .. warning::
  22. To download the dataset `gdown <https://github.com/wkentaro/gdown>`_ is required.
  23. """
  24. _FILES = {
  25. "train": {
  26. "images": (
  27. "camelyonpatch_level_2_split_train_x.h5", # Data file name
  28. "1Ka0XfEMiwgCYPdTI-vv6eUElOBnKFKQ2", # Google Drive ID
  29. "1571f514728f59376b705fc836ff4b63", # md5 hash
  30. ),
  31. "targets": (
  32. "camelyonpatch_level_2_split_train_y.h5",
  33. "1269yhu3pZDP8UYFQs-NYs3FPwuK-nGSG",
  34. "35c2d7259d906cfc8143347bb8e05be7",
  35. ),
  36. },
  37. "test": {
  38. "images": (
  39. "camelyonpatch_level_2_split_test_x.h5",
  40. "1qV65ZqZvWzuIVthK8eVDhIwrbnsJdbg_",
  41. "d8c2d60d490dbd479f8199bdfa0cf6ec",
  42. ),
  43. "targets": (
  44. "camelyonpatch_level_2_split_test_y.h5",
  45. "17BHrSrwWKjYsOgTMmoqrIjDy6Fa2o_gP",
  46. "60a7035772fbdb7f34eb86d4420cf66a",
  47. ),
  48. },
  49. "val": {
  50. "images": (
  51. "camelyonpatch_level_2_split_valid_x.h5",
  52. "1hgshYGWK8V-eGRy8LToWJJgDU_rXWVJ3",
  53. "d5b63470df7cfa627aeec8b9dc0c066e",
  54. ),
  55. "targets": (
  56. "camelyonpatch_level_2_split_valid_y.h5",
  57. "1bH8ZRbhSVAhScTS0p9-ZzGnX91cHT3uO",
  58. "2b85f58b927af9964a4c15b8f7e8f179",
  59. ),
  60. },
  61. }
  62. def __init__(
  63. self,
  64. root: Union[str, pathlib.Path],
  65. split: str = "train",
  66. transform: Optional[Callable] = None,
  67. target_transform: Optional[Callable] = None,
  68. download: bool = False,
  69. ):
  70. try:
  71. import h5py
  72. self.h5py = h5py
  73. except ImportError:
  74. raise RuntimeError(
  75. "h5py is not found. This dataset needs to have h5py installed: please run pip install h5py"
  76. )
  77. self._split = verify_str_arg(split, "split", ("train", "test", "val"))
  78. super().__init__(root, transform=transform, target_transform=target_transform)
  79. self._base_folder = pathlib.Path(self.root) / "pcam"
  80. if download:
  81. self._download()
  82. if not self._check_exists():
  83. raise RuntimeError("Dataset not found. You can use download=True to download it")
  84. def __len__(self) -> int:
  85. images_file = self._FILES[self._split]["images"][0]
  86. with self.h5py.File(self._base_folder / images_file) as images_data:
  87. return images_data["x"].shape[0]
  88. def __getitem__(self, idx: int) -> Tuple[Any, Any]:
  89. images_file = self._FILES[self._split]["images"][0]
  90. with self.h5py.File(self._base_folder / images_file) as images_data:
  91. image = Image.fromarray(images_data["x"][idx]).convert("RGB")
  92. targets_file = self._FILES[self._split]["targets"][0]
  93. with self.h5py.File(self._base_folder / targets_file) as targets_data:
  94. target = int(targets_data["y"][idx, 0, 0, 0]) # shape is [num_images, 1, 1, 1]
  95. if self.transform:
  96. image = self.transform(image)
  97. if self.target_transform:
  98. target = self.target_transform(target)
  99. return image, target
  100. def _check_exists(self) -> bool:
  101. images_file = self._FILES[self._split]["images"][0]
  102. targets_file = self._FILES[self._split]["targets"][0]
  103. return all(self._base_folder.joinpath(h5_file).exists() for h5_file in (images_file, targets_file))
  104. def _download(self) -> None:
  105. if self._check_exists():
  106. return
  107. for file_name, file_id, md5 in self._FILES[self._split].values():
  108. archive_name = file_name + ".gz"
  109. download_file_from_google_drive(file_id, str(self._base_folder), filename=archive_name, md5=md5)
  110. _decompress(str(self._base_folder / archive_name))
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...