hmdb51.py
import glob
import os
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from torch import Tensor

from .folder import find_classes, make_dataset
from .video_utils import VideoClips
from .vision import VisionDataset

class HMDB51(VisionDataset):
    """
    `HMDB51 <https://serre-lab.clps.brown.edu/resource/hmdb-a-large-human-motion-database/>`_
    dataset.

    HMDB51 is an action recognition video dataset.
    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2. Note that we drop
    clips which do not have exactly ``frames_per_clip`` elements, so not all frames in a video
    might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (str or ``pathlib.Path``): Root directory of the HMDB51 Dataset.
        annotation_path (str): Path to the folder containing the split files.
        frames_per_clip (int): Number of frames in a clip.
        step_between_clips (int): Number of frames between each clip.
        fold (int, optional): Which fold to use. Should be between 1 and 3.
        train (bool, optional): If ``True``, creates a dataset from the train split,
            otherwise from the ``test`` split.
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" (default) or "TCHW".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the ``T`` video frames
            - audio (Tensor[K, L]): the audio frames, where ``K`` is the number of channels
              and ``L`` is the number of points
            - label (int): class of the video clip
    """

    data_url = "https://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar"
    splits = {
        "url": "https://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/test_train_splits.rar",
        "md5": "15e67781e70dcfbdce2d7dbb9b3344b5",
    }
    TRAIN_TAG = 1
    TEST_TAG = 2

    def __init__(
        self,
        root: Union[str, Path],
        annotation_path: str,
        frames_per_clip: int,
        step_between_clips: int = 1,
        frame_rate: Optional[int] = None,
        fold: int = 1,
        train: bool = True,
        transform: Optional[Callable] = None,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        num_workers: int = 1,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        output_format: str = "THWC",
    ) -> None:
        super().__init__(root)
        if fold not in (1, 2, 3):
            raise ValueError(f"fold should be between 1 and 3, got {fold}")

        extensions = ("avi",)
        self.classes, class_to_idx = find_classes(self.root)
        self.samples = make_dataset(
            self.root,
            class_to_idx,
            extensions,
        )

        video_paths = [path for (path, _) in self.samples]
        video_clips = VideoClips(
            video_paths,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            output_format=output_format,
        )
        # we bookkeep the full version of video clips because we want to be able
        # to return the metadata of the full version rather than the subset version of
        # video clips
        self.full_video_clips = video_clips
        self.fold = fold
        self.train = train
        self.indices = self._select_fold(video_paths, annotation_path, fold, train)
        self.video_clips = video_clips.subset(self.indices)
        self.transform = transform

    @property
    def metadata(self) -> Dict[str, Any]:
        return self.full_video_clips.metadata

    def _select_fold(self, video_list: List[str], annotations_dir: str, fold: int, train: bool) -> List[int]:
        # The official split files are named "<class>_test_split<fold>.txt"; each line contains
        # a video filename followed by a tag (1 = train, 2 = test).
        target_tag = self.TRAIN_TAG if train else self.TEST_TAG
        split_pattern_name = f"*test_split{fold}.txt"
        split_pattern_path = os.path.join(annotations_dir, split_pattern_name)
        annotation_paths = glob.glob(split_pattern_path)
        selected_files = set()
        for filepath in annotation_paths:
            with open(filepath) as fid:
                lines = fid.readlines()

            for line in lines:
                video_filename, tag_string = line.split()
                tag = int(tag_string)
                if tag == target_tag:
                    selected_files.add(video_filename)

        indices = []
        for video_index, video_path in enumerate(video_list):
            if os.path.basename(video_path) in selected_files:
                indices.append(video_index)

        return indices

    def __len__(self) -> int:
        return self.video_clips.num_clips()

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
        video, audio, _, video_idx = self.video_clips.get_clip(idx)
        sample_index = self.indices[video_idx]
        _, class_index = self.samples[sample_index]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, class_index
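
For reference, below is a minimal usage sketch, assuming this module is the one shipped as torchvision.datasets.HMDB51 and that the HMDB51 videos and the official split files have already been downloaded and extracted locally (the class above defines download URLs but contains no download logic). The directory paths are placeholders; frames_per_clip and step_between_clips follow the clip-slicing behavior described in the docstring.

    # Usage sketch (assumptions: torchvision installed, videos and split files extracted locally).
    from torchvision.datasets import HMDB51

    dataset = HMDB51(
        root="data/hmdb51/videos",             # placeholder: one subdirectory of .avi files per class
        annotation_path="data/hmdb51/splits",  # placeholder: folder with the *_test_split<fold>.txt files
        frames_per_clip=16,
        step_between_clips=8,
        fold=1,
        train=True,
        output_format="TCHW",
    )

    print(len(dataset))          # number of clips in the selected fold/split, per __len__ above
    video, audio, label = dataset[0]
    print(video.shape, label)    # e.g. a [16, 3, H, W] tensor and an integer class index

Because each item is a single fixed-length clip rather than a whole video, len(dataset) counts clips, not videos, exactly as in the (2 + 3) = 5 example in the docstring.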