From 3d948dc547ab569fe986471ff10aa0cb8dfd95b1 Mon Sep 17 00:00:00 2001 From: frgfm Date: Tue, 13 Jul 2021 01:15:51 +0200 Subject: [PATCH 1/7] style: Fixed last missing typing annotation --- torchvision/datasets/video_utils.py | 67 +++++++++++++++-------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 987270c4cd4..7c0a100bb22 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -2,7 +2,7 @@ import math import warnings from fractions import Fraction -from typing import List +from typing import Any, Dict, List, Optional import torch from torchvision.io import ( @@ -10,6 +10,7 @@ _read_video_from_file, read_video, read_video_timestamps, + VideoMetaData, ) from .utils import tqdm @@ -27,7 +28,7 @@ def pts_convert(pts, timebase_from, timebase_to, round_func=math.floor): return round_func(new_pts) -def unfold(tensor, size, step, dilation=1): +def unfold(tensor: Tensor, size: int, step: int, dilation=1) -> List[int]: """ similar to tensor.unfold, but with the dilation and specialized for 1d tensors @@ -55,17 +56,17 @@ class _VideoTimestampsDataset(object): pickled when forking. """ - def __init__(self, video_paths: List[str]): + def __init__(self, video_paths: List[str]) -> None: self.video_paths = video_paths - def __len__(self): + def __len__(self) -> int: return len(self.video_paths) - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> Tuple[List[int], Optional[float]]: return read_video_timestamps(self.video_paths[idx]) -def _collate_fn(x): +def _collate_fn(x: Any) -> Any: """ Dummy collate function to be used with _VideoTimestampsDataset """ @@ -100,19 +101,19 @@ class VideoClips(object): def __init__( self, - video_paths, - clip_length_in_frames=16, - frames_between_clips=1, - frame_rate=None, - _precomputed_metadata=None, - num_workers=0, - _video_width=0, - _video_height=0, - _video_min_dimension=0, - _video_max_dimension=0, - _audio_samples=0, - _audio_channels=0, - ): + video_paths: List[str], + clip_length_in_frames: int = 16, + frames_between_clips: int = 1, + frame_rate: Optional[int] = None, + _precomputed_metadata: Optional[Dict[str, Any]] = None, + num_workers: int = 0, + _video_width: int = 0, + _video_height: int = 0, + _video_min_dimension: int = 0, + _video_max_dimension: int = 0, + _audio_samples: int = 0, + _audio_channels: int = 0, + ) -> None: self.video_paths = video_paths self.num_workers = num_workers @@ -131,7 +132,7 @@ def __init__( self._init_from_metadata(_precomputed_metadata) self.compute_clips(clip_length_in_frames, frames_between_clips, frame_rate) - def _compute_frame_pts(self): + def _compute_frame_pts(self) -> None: self.video_pts = [] self.video_fps = [] @@ -157,7 +158,7 @@ def _compute_frame_pts(self): self.video_pts.extend(clips) self.video_fps.extend(fps) - def _init_from_metadata(self, metadata): + def _init_from_metadata(self, metadata: Dict[str, Any]) -> None: self.video_paths = metadata["video_paths"] assert len(self.video_paths) == len(metadata["video_pts"]) self.video_pts = metadata["video_pts"] @@ -165,7 +166,7 @@ def _init_from_metadata(self, metadata): self.video_fps = metadata["video_fps"] @property - def metadata(self): + def metadata(self) -> Dict[str, Any]: _metadata = { "video_paths": self.video_paths, "video_pts": self.video_pts, @@ -173,7 +174,7 @@ def metadata(self): } return _metadata - def subset(self, indices): + def subset(self, indices: List[int]) -> Any: video_paths = 
[self.video_paths[i] for i in indices] video_pts = [self.video_pts[i] for i in indices] video_fps = [self.video_fps[i] for i in indices] @@ -198,7 +199,7 @@ def subset(self, indices): ) @staticmethod - def compute_clips_for_video(video_pts, num_frames, step, fps, frame_rate): + def compute_clips_for_video(video_pts, num_frames, step, fps, frame_rate) -> Tuple[, List[int]]: if fps is None: # if for some reason the video doesn't have fps (because doesn't have a video stream) # set the fps to 1. The value doesn't matter, because video_pts is empty anyway @@ -220,7 +221,7 @@ def compute_clips_for_video(video_pts, num_frames, step, fps, frame_rate): idxs = unfold(idxs, num_frames, step) return clips, idxs - def compute_clips(self, num_frames, step, frame_rate=None): + def compute_clips(self, num_frames: int, step, frame_rate: Optional[int] = None) -> None: """ Compute all consecutive sequences of clips from video_pts. Always returns clips of size `num_frames`, meaning that the @@ -245,19 +246,19 @@ def compute_clips(self, num_frames, step, frame_rate=None): clip_lengths = torch.as_tensor([len(v) for v in self.clips]) self.cumulative_sizes = clip_lengths.cumsum(0).tolist() - def __len__(self): + def __len__(self) -> int: return self.num_clips() - def num_videos(self): + def num_videos(self) -> int: return len(self.video_paths) - def num_clips(self): + def num_clips(self) -> int: """ Number of subclips that are available in the video list. """ return self.cumulative_sizes[-1] - def get_clip_location(self, idx): + def get_clip_location(self, idx: int) -> Tuple[int, int]: """ Converts a flattened representation of the indices into a video_idx, clip_idx representation. @@ -270,7 +271,7 @@ def get_clip_location(self, idx): return video_idx, clip_idx @staticmethod - def _resample_video_idx(num_frames, original_fps, new_fps): + def _resample_video_idx(num_frames: int, original_fps: int, new_fps: int) -> Tensor: step = float(original_fps) / new_fps if step.is_integer(): # optimization: if step is integer, don't need to perform @@ -281,7 +282,7 @@ def _resample_video_idx(num_frames, original_fps, new_fps): idxs = idxs.floor().to(torch.int64) return idxs - def get_clip(self, idx): + def get_clip(self, idx: int) -> Tuple[Tensor, Tensor, VideoMetaData, int]: """ Gets a subclip from a list of videos. 
@@ -381,7 +382,7 @@ def get_clip(self, idx): ) return video, audio, info, video_idx - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: video_pts_sizes = [len(v) for v in self.video_pts] # To be back-compatible, we convert data to dtype torch.long as needed # because for empty list, in legacy implementation, torch.as_tensor will @@ -409,7 +410,7 @@ def __getstate__(self): d["_version"] = 2 return d - def __setstate__(self, d): + def __setstate__(self, d: Dict[str, Any]) -> None: # for backwards-compatibility if "_version" not in d: self.__dict__ = d From ef8ef3b41d0215089c8797bd645169b1c14aca26 Mon Sep 17 00:00:00 2001 From: frgfm Date: Tue, 13 Jul 2021 11:47:40 +0200 Subject: [PATCH 2/7] style: Fixed typing --- torchvision/datasets/video_utils.py | 41 ++++++++++++++++++----------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 7c0a100bb22..84d65df4247 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -2,7 +2,7 @@ import math import warnings from fractions import Fraction -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Callable, Union, Tuple import torch from torchvision.io import ( @@ -16,7 +16,12 @@ from .utils import tqdm -def pts_convert(pts, timebase_from, timebase_to, round_func=math.floor): +def pts_convert( + pts: int, + timebase_from: Fraction, + timebase_to: Fraction, + round_func: Callable = math.floor +) -> float: """convert pts between different time bases Args: pts: presentation timestamp, float @@ -28,7 +33,7 @@ def pts_convert(pts, timebase_from, timebase_to, round_func=math.floor): return round_func(new_pts) -def unfold(tensor: Tensor, size: int, step: int, dilation=1) -> List[int]: +def unfold(tensor: torch.Tensor, size: int, step: int, dilation: int = 1) -> torch.Tensor: """ similar to tensor.unfold, but with the dilation and specialized for 1d tensors @@ -199,7 +204,13 @@ def subset(self, indices: List[int]) -> Any: ) @staticmethod - def compute_clips_for_video(video_pts, num_frames, step, fps, frame_rate) -> Tuple[, List[int]]: + def compute_clips_for_video( + video_pts: torch.Tensor, + num_frames: int, + step: int, + fps: int, + frame_rate: Optional[int] = None + ) -> Tuple[torch.Tensor, Union[List[slice], torch.Tensor]]: if fps is None: # if for some reason the video doesn't have fps (because doesn't have a video stream) # set the fps to 1. The value doesn't matter, because video_pts is empty anyway @@ -221,7 +232,7 @@ def compute_clips_for_video(video_pts, num_frames, step, fps, frame_rate) -> Tup idxs = unfold(idxs, num_frames, step) return clips, idxs - def compute_clips(self, num_frames: int, step, frame_rate: Optional[int] = None) -> None: + def compute_clips(self, num_frames: int, step: int, frame_rate: Optional[int] = None) -> None: """ Compute all consecutive sequences of clips from video_pts. 
Always returns clips of size `num_frames`, meaning that the @@ -271,7 +282,7 @@ def get_clip_location(self, idx: int) -> Tuple[int, int]: return video_idx, clip_idx @staticmethod - def _resample_video_idx(num_frames: int, original_fps: int, new_fps: int) -> Tensor: + def _resample_video_idx(num_frames: int, original_fps: int, new_fps: int) -> Union[slice, torch.Tensor]: step = float(original_fps) / new_fps if step.is_integer(): # optimization: if step is integer, don't need to perform @@ -282,7 +293,7 @@ def _resample_video_idx(num_frames: int, original_fps: int, new_fps: int) -> Ten idxs = idxs.floor().to(torch.int64) return idxs - def get_clip(self, idx: int) -> Tuple[Tensor, Tensor, VideoMetaData, int]: + def get_clip(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any], int]: """ Gets a subclip from a list of videos. @@ -330,21 +341,21 @@ def get_clip(self, idx: int) -> Tuple[Tensor, Tensor, VideoMetaData, int]: end_pts = clip_pts[-1].item() video, audio, info = read_video(video_path, start_pts, end_pts) else: - info = _probe_video_from_file(video_path) - video_fps = info.video_fps + _info = _probe_video_from_file(video_path) + video_fps = _info.video_fps audio_fps = None - video_start_pts = clip_pts[0].item() - video_end_pts = clip_pts[-1].item() + video_start_pts: int = clip_pts[0].item() + video_end_pts: int = clip_pts[-1].item() audio_start_pts, audio_end_pts = 0, -1 audio_timebase = Fraction(0, 1) video_timebase = Fraction( - info.video_timebase.numerator, info.video_timebase.denominator + _info.video_timebase.numerator, _info.video_timebase.denominator ) - if info.has_audio: + if _info.has_audio: audio_timebase = Fraction( - info.audio_timebase.numerator, info.audio_timebase.denominator + _info.audio_timebase.numerator, _info.audio_timebase.denominator ) audio_start_pts = pts_convert( video_start_pts, video_timebase, audio_timebase, math.floor @@ -352,7 +363,7 @@ def get_clip(self, idx: int) -> Tuple[Tensor, Tensor, VideoMetaData, int]: audio_end_pts = pts_convert( video_end_pts, video_timebase, audio_timebase, math.ceil ) - audio_fps = info.audio_sample_rate + audio_fps = _info.audio_sample_rate video, audio, info = _read_video_from_file( video_path, video_width=self._video_width, From c40bac5d04d873374f7876bee58d3dd912ef621a Mon Sep 17 00:00:00 2001 From: frgfm Date: Tue, 13 Jul 2021 12:01:00 +0200 Subject: [PATCH 3/7] style: Fixed remaining typing annotations --- torchvision/datasets/video_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 84d65df4247..501c78d8b7e 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -21,7 +21,7 @@ def pts_convert( timebase_from: Fraction, timebase_to: Fraction, round_func: Callable = math.floor -) -> float: +) -> int: """convert pts between different time bases Args: pts: presentation timestamp, float @@ -145,8 +145,8 @@ def _compute_frame_pts(self) -> None: # so need to create a dummy dataset first import torch.utils.data - dl = torch.utils.data.DataLoader( - _VideoTimestampsDataset(self.video_paths), + dl: torch.utils.data.DataLoader = torch.utils.data.DataLoader( + _VideoTimestampsDataset(self.video_paths), # type: ignore[arg-type] batch_size=16, num_workers=self.num_workers, collate_fn=_collate_fn, @@ -227,10 +227,10 @@ def compute_clips_for_video( warnings.warn("There aren't enough frames in the current video to get a clip for the given clip length and " "frames 
between clips. The video (and potentially others) will be skipped.") if isinstance(idxs, slice): - idxs = [idxs] * len(clips) + idxs = [idxs] * len(clips) # type: ignore[assignment] else: idxs = unfold(idxs, num_frames, step) - return clips, idxs + return clips, idxs # type: ignore[return-value] def compute_clips(self, num_frames: int, step: int, frame_rate: Optional[int] = None) -> None: """ @@ -345,8 +345,8 @@ def get_clip(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any] video_fps = _info.video_fps audio_fps = None - video_start_pts: int = clip_pts[0].item() - video_end_pts: int = clip_pts[-1].item() + video_start_pts = int(clip_pts[0].item()) + video_end_pts = int(clip_pts[-1].item()) audio_start_pts, audio_end_pts = 0, -1 audio_timebase = Fraction(0, 1) @@ -402,10 +402,10 @@ def __getstate__(self) -> Dict[str, Any]: video_pts = [x.to(torch.int64) for x in self.video_pts] # video_pts can be an empty list if no frames have been decoded if video_pts: - video_pts = torch.cat(video_pts) + video_pts = torch.cat(video_pts) # type: ignore[assignment] # avoid bug in https://github.com/pytorch/pytorch/issues/32351 # TODO: Revert it once the bug is fixed. - video_pts = video_pts.numpy() + video_pts = video_pts.numpy() # type: ignore[attr-defined] # make a copy of the fields of self d = self.__dict__.copy() From d2e85eaa1b14102a32166c863ee9f786b83d9041 Mon Sep 17 00:00:00 2001 From: frgfm Date: Fri, 30 Jul 2021 10:42:57 +0200 Subject: [PATCH 4/7] style: Fixed typing --- torchvision/datasets/video_utils.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 501c78d8b7e..cd570773d2a 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -2,7 +2,7 @@ import math import warnings from fractions import Fraction -from typing import Any, Dict, List, Optional, Callable, Union, Tuple +from typing import Any, Dict, List, Optional, Callable, Union, Tuple, TypeVar, cast import torch from torchvision.io import ( @@ -15,6 +15,8 @@ from .utils import tqdm +T = TypeVar("T") + def pts_convert( pts: int, @@ -71,7 +73,7 @@ def __getitem__(self, idx: int) -> Tuple[List[int], Optional[float]]: return read_video_timestamps(self.video_paths[idx]) -def _collate_fn(x: Any) -> Any: +def _collate_fn(x: T) -> T: """ Dummy collate function to be used with _VideoTimestampsDataset """ @@ -179,7 +181,7 @@ def metadata(self) -> Dict[str, Any]: } return _metadata - def subset(self, indices: List[int]) -> Any: + def subset(self, indices: List[int]) -> "VideoClips": video_paths = [self.video_paths[i] for i in indices] video_pts = [self.video_pts[i] for i in indices] video_fps = [self.video_fps[i] for i in indices] @@ -218,19 +220,20 @@ def compute_clips_for_video( if frame_rate is None: frame_rate = fps total_frames = len(video_pts) * (float(frame_rate) / fps) - idxs = VideoClips._resample_video_idx( + _idxs = VideoClips._resample_video_idx( int(math.floor(total_frames)), fps, frame_rate ) - video_pts = video_pts[idxs] + video_pts = video_pts[_idxs] clips = unfold(video_pts, num_frames, step) if not clips.numel(): warnings.warn("There aren't enough frames in the current video to get a clip for the given clip length and " "frames between clips. 
The video (and potentially others) will be skipped.") - if isinstance(idxs, slice): - idxs = [idxs] * len(clips) # type: ignore[assignment] + idxs: Union[List[slice], torch.Tensor] + if isinstance(_idxs, slice): + idxs = [_idxs] * len(clips) else: - idxs = unfold(idxs, num_frames, step) - return clips, idxs # type: ignore[return-value] + idxs = unfold(_idxs, num_frames, step) + return clips, idxs def compute_clips(self, num_frames: int, step: int, frame_rate: Optional[int] = None) -> None: """ @@ -345,8 +348,8 @@ def get_clip(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any] video_fps = _info.video_fps audio_fps = None - video_start_pts = int(clip_pts[0].item()) - video_end_pts = int(clip_pts[-1].item()) + video_start_pts = cast(int, clip_pts[0].item()) + video_end_pts = cast(int, clip_pts[-1].item()) audio_start_pts, audio_end_pts = 0, -1 audio_timebase = Fraction(0, 1) From a29a1c8302987a5d8e461ff0e4c351320c8c1e84 Mon Sep 17 00:00:00 2001 From: frgfm Date: Thu, 18 Nov 2021 10:16:01 +0100 Subject: [PATCH 5/7] style: Fixed typing --- torchvision/datasets/video_utils.py | 33 +++++++++++------------------ 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 784ef78dd84..06a11f315cd 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -18,12 +18,7 @@ T = TypeVar("T") -def pts_convert( - pts: int, - timebase_from: Fraction, - timebase_to: Fraction, - round_func: Callable = math.floor -) -> int: +def pts_convert(pts: int, timebase_from: Fraction, timebase_to: Fraction, round_func: Callable = math.floor) -> int: """convert pts between different time bases Args: pts: presentation timestamp, float @@ -207,11 +202,7 @@ def subset(self, indices: List[int]) -> "VideoClips": @staticmethod def compute_clips_for_video( - video_pts: torch.Tensor, - num_frames: int, - step: int, - fps: int, - frame_rate: Optional[int] = None + video_pts: torch.Tensor, num_frames: int, step: int, fps: int, frame_rate: Optional[int] = None ) -> Tuple[torch.Tensor, Union[List[slice], torch.Tensor]]: if fps is None: # if for some reason the video doesn't have fps (because doesn't have a video stream) @@ -220,7 +211,7 @@ def compute_clips_for_video( if frame_rate is None: frame_rate = fps total_frames = len(video_pts) * (float(frame_rate) / fps) - idxs = VideoClips._resample_video_idx(int(math.floor(total_frames)), fps, frame_rate) + _idxs = VideoClips._resample_video_idx(int(math.floor(total_frames)), fps, frame_rate) video_pts = video_pts[idxs] clips = unfold(video_pts, num_frames, step) if not clips.numel(): @@ -228,11 +219,11 @@ def compute_clips_for_video( "There aren't enough frames in the current video to get a clip for the given clip length and " "frames between clips. The video (and potentially others) will be skipped." 
) - # idxs: Union[List[slice], torch.Tensor] - if isinstance(idxs, slice): - idxs = [idxs] * len(clips) + idxs: Union[List[slice], torch.Tensor] + if isinstance(_idxs, slice): + idxs = [_idxs] * len(clips) else: - idxs = unfold(idxs, num_frames, step) + idxs = unfold(_idxs, num_frames, step) return clips, idxs def compute_clips(self, num_frames: int, step: int, frame_rate: Optional[int] = None) -> None: @@ -344,13 +335,13 @@ def get_clip(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, Any] audio_start_pts, audio_end_pts = 0, -1 audio_timebase = Fraction(0, 1) - video_timebase = Fraction(info.video_timebase.numerator, info.video_timebase.denominator) - if info.has_audio: - audio_timebase = Fraction(info.audio_timebase.numerator, info.audio_timebase.denominator) + video_timebase = Fraction(_info.video_timebase.numerator, _info.video_timebase.denominator) + if _info.has_audio: + audio_timebase = Fraction(_info.audio_timebase.numerator, _info.audio_timebase.denominator) audio_start_pts = pts_convert(video_start_pts, video_timebase, audio_timebase, math.floor) audio_end_pts = pts_convert(video_end_pts, video_timebase, audio_timebase, math.ceil) - audio_fps = info.audio_sample_rate - video, audio, info = _read_video_from_file( + audio_fps = _info.audio_sample_rate + video, audio, _ = _read_video_from_file( video_path, video_width=self._video_width, video_height=self._video_height, From 01620d34099fc2bf42ff4211f3dcb24d23f47b3d Mon Sep 17 00:00:00 2001 From: frgfm Date: Thu, 18 Nov 2021 14:29:07 +0100 Subject: [PATCH 6/7] refactor: Removed unused import --- torchvision/datasets/video_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 06a11f315cd..38677afd1a3 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -10,7 +10,6 @@ _read_video_from_file, read_video, read_video_timestamps, - VideoMetaData, ) from .utils import tqdm From bad5f6185a8fdf9dffaebcd6a3aaf1eebc1fe7fd Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 18 Nov 2021 17:07:28 +0100 Subject: [PATCH 7/7] Update torchvision/datasets/video_utils.py --- torchvision/datasets/video_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchvision/datasets/video_utils.py b/torchvision/datasets/video_utils.py index 38677afd1a3..efa3836c8d1 100644 --- a/torchvision/datasets/video_utils.py +++ b/torchvision/datasets/video_utils.py @@ -211,7 +211,7 @@ def compute_clips_for_video( frame_rate = fps total_frames = len(video_pts) * (float(frame_rate) / fps) _idxs = VideoClips._resample_video_idx(int(math.floor(total_frames)), fps, frame_rate) - video_pts = video_pts[idxs] + video_pts = video_pts[_idxs] clips = unfold(video_pts, num_frames, step) if not clips.numel(): warnings.warn(
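
Note on the `_idxs`/`idxs` split that PATCH 4/7 introduces and PATCH 7/7 finishes (the `video_pts[_idxs]` fix above): `_resample_video_idx` is typed as returning `Union[slice, torch.Tensor]`, so `compute_clips_for_video` keeps the raw result in `_idxs` and only binds the annotated `idxs` after branching on `isinstance(_idxs, slice)`. Below is a minimal standalone sketch of that control flow; the arange-based resampling body and the `unfold` implementation are elided from these hunks, so both are reconstructed here as assumptions rather than copies of the library code.

import math
from typing import List, Union

import torch


def resample_video_idx(num_frames: int, original_fps: int, new_fps: int) -> Union[slice, torch.Tensor]:
    # Mirrors the typed _resample_video_idx: when the fps ratio is an integer,
    # return a plain slice instead of materialising an index tensor.
    step = float(original_fps) / new_fps
    if step.is_integer():
        return slice(None, None, int(step))
    idxs = torch.arange(num_frames, dtype=torch.float32) * step
    return idxs.floor().to(torch.int64)


def unfold_1d(t: torch.Tensor, size: int, step: int, dilation: int = 1) -> torch.Tensor:
    # Stand-in for the patch's `unfold` helper (its body is not part of the diff):
    # windows of `size` elements whose starts are `step` apart and whose elements
    # are `dilation` apart, returned as a 2-D tensor.
    span = dilation * (size - 1) + 1
    n = max((t.numel() - span) // step + 1, 0)
    starts = torch.arange(n) * step
    offsets = torch.arange(size) * dilation
    return t[starts[:, None] + offsets[None, :]]


video_pts = torch.arange(0, 300, 10)  # 30 fake frame timestamps
fps, frame_rate = 30, 15
total_frames = int(math.floor(len(video_pts) * (float(frame_rate) / fps)))
_idxs = resample_video_idx(total_frames, fps, frame_rate)
clips = unfold_1d(video_pts[_idxs], size=4, step=4)

# Same branching as compute_clips_for_video: a slice is broadcast per clip,
# a tensor of frame indices is unfolded alongside the pts.
idxs: Union[List[slice], torch.Tensor]
if isinstance(_idxs, slice):
    idxs = [_idxs] * len(clips)
else:
    idxs = unfold_1d(_idxs, size=4, step=4)
print(clips.shape, idxs)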
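
The `pts_convert` signature introduced in PATCH 2/7 (Fraction time bases, a Callable rounding function) and tightened to return `int` in PATCH 3/7 is used by `get_clip` to map a clip's video start/end pts into the audio time base, flooring the start and ceiling the end. The sketch below illustrates that conversion; the function body is elided from the hunks, so the exact-rescaling formula is an assumption, and the time bases and pts values are made up for illustration.

import math
from fractions import Fraction
from typing import Callable


def pts_convert(pts: int, timebase_from: Fraction, timebase_to: Fraction, round_func: Callable = math.floor) -> int:
    # Assumed body: rescale the timestamp exactly with Fractions, then round.
    return round_func(Fraction(pts, 1) * timebase_from / timebase_to)


# e.g. a video stream with a 1/30000 time base and audio with a 1/44100 time base
video_tb = Fraction(1, 30000)
audio_tb = Fraction(1, 44100)

video_start_pts, video_end_pts = 1001, 31031  # hypothetical clip bounds in video pts

# get_clip floors the start and ceils the end so the audio span covers the whole clip
audio_start_pts = pts_convert(video_start_pts, video_tb, audio_tb, math.floor)
audio_end_pts = pts_convert(video_end_pts, video_tb, audio_tb, math.ceil)
print(audio_start_pts, audio_end_pts)  # 1471 45616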
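
PATCH 4/7 retypes `_collate_fn` with a TypeVar (`(x: T) -> T`): it is effectively an identity collate, there so the `DataLoader` in `_compute_frame_pts` can hand back the per-video `(timestamps, fps)` tuples untouched instead of trying to stack timestamp lists of different lengths. A toy sketch of the same pattern follows, with a made-up stand-in dataset in place of `_VideoTimestampsDataset` and of the elided batch-unpacking lines.

from typing import List, Tuple, TypeVar

import torch
from torch.utils.data import DataLoader, Dataset

T = TypeVar("T")


def identity_collate(batch: T) -> T:
    # Same idea as the patch's _collate_fn: return the list of per-item
    # results unchanged, since variable-length timestamp lists cannot be
    # stacked by the default collate function.
    return batch


class FakeTimestampsDataset(Dataset):
    # Toy stand-in for _VideoTimestampsDataset: each "video" yields a
    # different number of timestamps plus an fps value.
    def __init__(self, lengths: List[int]) -> None:
        self.lengths = lengths

    def __len__(self) -> int:
        return len(self.lengths)

    def __getitem__(self, idx: int) -> Tuple[List[int], float]:
        return list(range(self.lengths[idx])), 30.0


dl: DataLoader = DataLoader(
    FakeTimestampsDataset([3, 5, 2, 7]),
    batch_size=2,
    num_workers=0,
    collate_fn=identity_collate,
)

video_pts, video_fps = [], []
for batch in dl:
    pts, fps = list(zip(*batch))
    video_pts.extend(torch.as_tensor(p, dtype=torch.int64) for p in pts)
    video_fps.extend(fps)
print([t.tolist() for t in video_pts], video_fps)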