ucf101.py 5.4 KB

import os
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from torch import Tensor

from .folder import find_classes, make_dataset
from .video_utils import VideoClips
from .vision import VisionDataset


class UCF101(VisionDataset):
    """
    `UCF101 <https://www.crcv.ucf.edu/data/UCF101.php>`_ dataset.

    UCF101 is an action recognition video dataset.
    This dataset considers every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``. The dataset itself can be downloaded from the dataset website;
    annotations that ``annotation_path`` should be pointing to can be downloaded from `here
    <https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip>`_.

    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.

    Internally, it uses a VideoClips object to handle clip creation.

    Args:
        root (str or ``pathlib.Path``): Root directory of the UCF101 Dataset.
        annotation_path (str): path to the folder containing the split files;
            see docstring above for download instructions of these files
        frames_per_clip (int): number of frames in a clip.
        step_between_clips (int, optional): number of frames between each clip.
        fold (int, optional): which fold to use. Should be between 1 and 3.
        train (bool, optional): if ``True``, creates a dataset from the train split,
            otherwise from the ``test`` split.
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
        output_format (str, optional): The format of the output video tensors (before transforms).
            Can be either "THWC" (default) or "TCHW".

    Returns:
        tuple: A 3-tuple with the following entries:

            - video (Tensor[T, H, W, C] or Tensor[T, C, H, W]): the `T` video frames
            - audio (Tensor[K, L]): the audio frames, where `K` is the number of channels
              and `L` is the number of points
            - label (int): class of the video clip
    """

    def __init__(
        self,
        root: Union[str, Path],
        annotation_path: str,
        frames_per_clip: int,
        step_between_clips: int = 1,
        frame_rate: Optional[int] = None,
        fold: int = 1,
        train: bool = True,
        transform: Optional[Callable] = None,
        _precomputed_metadata: Optional[Dict[str, Any]] = None,
        num_workers: int = 1,
        _video_width: int = 0,
        _video_height: int = 0,
        _video_min_dimension: int = 0,
        _audio_samples: int = 0,
        output_format: str = "THWC",
    ) -> None:
        super().__init__(root)
        if not 1 <= fold <= 3:
            raise ValueError(f"fold should be between 1 and 3, got {fold}")

        extensions = ("avi",)
        self.fold = fold
        self.train = train

        self.classes, class_to_idx = find_classes(self.root)
        self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
        video_list = [x[0] for x in self.samples]
        video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
            num_workers=num_workers,
            _video_width=_video_width,
            _video_height=_video_height,
            _video_min_dimension=_video_min_dimension,
            _audio_samples=_audio_samples,
            output_format=output_format,
        )
        # we bookkeep the full version of video clips because we want to be able
        # to return the metadata of the full version rather than the subset version of
        # video clips
        self.full_video_clips = video_clips
        self.indices = self._select_fold(video_list, annotation_path, fold, train)
        self.video_clips = video_clips.subset(self.indices)
        self.transform = transform

    @property
    def metadata(self) -> Dict[str, Any]:
        return self.full_video_clips.metadata

    def _select_fold(self, video_list: List[str], annotation_path: str, fold: int, train: bool) -> List[int]:
        name = "train" if train else "test"
        name = f"{name}list{fold:02d}.txt"
        f = os.path.join(annotation_path, name)
        selected_files = set()
        with open(f) as fid:
            data = fid.readlines()
            data = [x.strip().split(" ")[0] for x in data]
            data = [os.path.join(self.root, *x.split("/")) for x in data]
            selected_files.update(data)
        indices = [i for i in range(len(video_list)) if video_list[i] in selected_files]
        return indices

    def __len__(self) -> int:
        return self.video_clips.num_clips()

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]:
        video, audio, info, video_idx = self.video_clips.get_clip(idx)
        label = self.samples[self.indices[video_idx]][1]

        if self.transform is not None:
            video = self.transform(video)

        return video, audio, label
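For context, a minimal usage sketch of this class follows. It is not part of the file above: it assumes torchvision is installed, that the UCF101 videos have been extracted into one sub-folder per action class, and that the official split files sit in a separate folder; the paths "data/UCF-101" and "data/ucfTrainTestlist" are placeholders to adjust for your own setup.

# Minimal usage sketch (assumed paths, not from the file above).
from torchvision.datasets import UCF101

dataset = UCF101(
    root="data/UCF-101",                       # placeholder: one sub-folder of .avi files per class
    annotation_path="data/ucfTrainTestlist",   # placeholder: contains trainlist01.txt, testlist01.txt, ...
    frames_per_clip=16,
    step_between_clips=16,
    fold=1,
    train=True,
    output_format="TCHW",                      # return clips as (T, C, H, W) instead of the default THWC
)

print(len(dataset))        # number of 16-frame clips in fold 1 of the train split
video, audio, label = dataset[0]
print(video.shape, label)  # e.g. a [16, 3, H, W] tensor and an integer class index

Note that constructing the dataset scans every video to build the clip index, which can take a while on the first run; the num_workers argument parallelizes that scan, and _precomputed_metadata lets you reuse the metadata from a previously built VideoClips object.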