diff --git a/docs/source/io.rst b/docs/source/io.rst index 6c21e68cf59..7bc3dee26fc 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -17,6 +17,40 @@ Video .. autofunction:: write_video +Fine-grained video API +---------------------- + +In addition to the :func:`read_video` function, we provide a high-performance +lower-level API for more fine-grained control compared to the :func:`read_video` function. +It does all this whilst fully supporting torchscript. + +.. autoclass:: Video + :members: next, get_metadata, set_current_stream, seek + + +Example of usage: + +.. code:: python + + import torchvision + video_path = "path to a test video" + # Constructor allocates memory and a threaded decoder + # instance per video. At the moment it takes two arguments: + # path to the video file, and a wanted stream. + reader = torchvision.io.Video(video_path, "video") + + # The information about the video can be retrieved using the + # `get_metadata()` method. It returns a dictionary for every stream, with + # duration and other relevant metadata (often frame rate) + reader_md = reader.get_metadata() + + # metadata is structured as a dict of dicts with following structure + # {"stream_type": {"attribute": [attribute per stream]}} + # + # following would print out the list of frame rates for every present video stream + print(reader_md["video"]["fps"]) + + Image ----- diff --git a/test/test_video.py b/test/test_video.py index 63434fa9c1f..7dc6b8aeb93 100644 --- a/test/test_video.py +++ b/test/test_video.py @@ -10,7 +10,7 @@ import torch import torchvision -from torchvision.io import _HAS_VIDEO_OPT +from torchvision.io import _HAS_VIDEO_OPT, Video try: import av @@ -289,7 +289,7 @@ def test_read_video_tensor(self): tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec") tv_result = tv_result.permute(0, 3, 1, 2) # pass 2: decode all frames using new api - reader = torch.classes.torchvision.Video(full_path, "video") + reader = Video(full_path, "video") frames = []
t, _ = reader.next() while t.numel() > 0: @@ -310,7 +310,7 @@ def test_read_video_tensor(self): # s = min(r) # e = max(r) - # reader = torch.classes.torchvision.Video(full_path, "video") + # reader = Video(full_path, "video") # results = _template_read_video(reader, s, e) # tv_video, tv_audio, info = torchvision.io.read_video( # full_path, start_pts=s, end_pts=e, pts_unit="sec" @@ -329,7 +329,7 @@ def test_read_video_tensor(self): # full_path, pts_unit="sec" # ) # # pass 2: decode all frames using new api - # reader = torch.classes.torchvision.Video(full_path, "video") + # reader = Video(full_path, "video") # pts = [] # t, p = reader.next() # while t.numel() > 0: @@ -353,7 +353,7 @@ def test_metadata(self): torchvision.set_video_backend("pyav") for test_video, config in test_videos.items(): full_path = os.path.join(VIDEO_DIR, test_video) - reader = torch.classes.torchvision.Video(full_path, "video") + reader = Video(full_path, "video") reader_md = reader.get_metadata() self.assertAlmostEqual( config.video_fps, reader_md["video"]["fps"][0], delta=0.0001 @@ -372,7 +372,7 @@ def test_video_reading_fn(self): ref_result = _decode_frames_by_av_module(full_path) - reader = torch.classes.torchvision.Video(full_path, "video") + reader = Video(full_path, "video") newapi_result = _template_read_video(reader) # First we check if the frames are approximately the same diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py index 029dd311592..bdd9a88908a 100644 --- a/torchvision/io/__init__.py +++ b/torchvision/io/__init__.py @@ -1,3 +1,5 @@ +import torch + from ._video_opt import ( Timebase, VideoMetaData, @@ -20,10 +22,94 @@ encode_jpeg, write_jpeg, encode_png, - write_png + write_png, ) +if _HAS_VIDEO_OPT: + + class Video: + """ + Fine-grained video-reading API. + Supports frame-by-frame reading of various streams from a single video + container. 
+ + Args: + + path (string): Path to the video file in supported format + + stream (string, optional): descriptor of the required stream. Defaults to "video:0" + Currently available options include ``['video', 'audio', 'cc', 'sub']`` + + Example: + The following example creates a :class:`Video` object, seeks into 2s + point, and returns a single frame:: + import torchvision + video_path = "path_to_a_test_video" + + reader = torchvision.io.Video(video_path, "video") + reader.seek(2.0) + frame, timestamp = reader.next() + """ + + def __init__(self, path, stream="video"): + self._c = torch.classes.torchvision.Video(path, stream) + + def next(self): + """Iterator that decodes the next frame of the current stream + + Returns: + ([torch.Tensor, float]): list containing decoded frame and corresponding timestamp + + """ + return self._c.next() + + def seek(self, time_s: float): + """Seek within current stream. + + Args: + time_s (float): seek time in seconds + + .. note:: + Current implementation is the so-called precise seek. This + means following seek, call to :meth:`next` will return the + frame with the exact timestamp if it exists or + the first frame with timestamp larger than time_s. + """ + self._c.seek(time_s) + + def get_metadata(self): + """Returns video metadata + + Returns: + (dict): dictionary containing duration and frame rate for every stream + """ + return self._c.get_metadata() + + def set_current_stream(self, stream: str): + """Set current stream. + Explicitly define the stream we are operating on. + + Args: + stream (string): descriptor of the required stream. Defaults to "video:0" + Currently available stream types include ``['video', 'audio', 'cc', 'sub']``. + Each descriptor consists of two parts: stream type (e.g. 'video') and + a unique stream id (which are determined by video encoding). + In this way, if the video container contains multiple + streams of the same type, users can access the one they want.
+ If only stream type is passed, the decoder auto-detects first stream + of that type and returns it. + + Returns: + (bool): True on success, False otherwise + """ + return self._c.set_current_stream(stream) + + +else: + Video = None + + __all__ = [ "write_video", "read_video", @@ -39,10 +125,11 @@ "_read_video_meta_data", "VideoMetaData", "Timebase", - 'read_image', - 'decode_image', - 'encode_jpeg', - 'write_jpeg', - 'encode_png', - 'write_png', + "read_image", + "decode_image", + "encode_jpeg", + "write_jpeg", + "encode_png", + "write_png", + "Video", ]