Skip to content

Commit d537965

Browse files
bjuncek and fmassa
authored
[documentation] video API documentation and wrapper (#2778)
* initial API documentation attempt * test the docs * initial commit * updating test to match the registration * adding the warning on unsuccessful import * Try to do conditional import * Simple fix? * clearing up docs * docstring commit * Adding types in arguments Co-authored-by: Francisco Massa <[email protected]> * reverting warning commit * addressing Francisco's comments * Apply suggestions from code review Co-authored-by: Francisco Massa <[email protected]> * Revert "reverting warning commit" This reverts commit bd1a3dd. * Revert "adding the warning on unsucessful import" This reverts commit afef7df. * remove warnings import Co-authored-by: Francisco Massa <[email protected]>
1 parent b217165 commit d537965

File tree

3 files changed

+134
-13
lines changed

3 files changed

+134
-13
lines changed

docs/source/io.rst

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,40 @@ Video
1717
.. autofunction:: write_video
1818

1919

20+
Fine-grained video API
21+
----------------------
22+
23+
In addition to the :func:`read_video` function, we provide a high-performance
24+
lower-level API for more fine-grained control compared to the :func:`read_video` function.
25+
It does all this whilst fully supporting torchscript.
26+
27+
.. autoclass:: Video
28+
:members: next, get_metadata, set_current_stream, seek
29+
30+
31+
Example of usage:
32+
33+
.. code:: python
34+
35+
import torchvision
36+
video_path = "path to a test video"
37+
# Constructor allocates memory and a threaded decoder
38+
# instance per video. At the moment it takes two arguments:
39+
# path to the video file, and a wanted stream.
40+
reader = torchvision.io.Video(video_path, "video")
41+
42+
# The information about the video can be retrieved using the
43+
# `get_metadata()` method. It returns a dictionary for every stream, with
44+
# duration and other relevant metadata (often frame rate)
45+
reader_md = reader.get_metadata()
46+
47+
# metadata is structured as a dict of dicts with following structure
48+
# {"stream_type": {"attribute": [attribute per stream]}}
49+
#
50+
# following would print out the list of frame rates for every present video stream
51+
print(reader_md["video"]["fps"])
52+
53+
2054
Image
2155
-----
2256

test/test_video.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import torch
1212
import torchvision
13-
from torchvision.io import _HAS_VIDEO_OPT
13+
from torchvision.io import _HAS_VIDEO_OPT, Video
1414

1515
try:
1616
import av
@@ -289,7 +289,7 @@ def test_read_video_tensor(self):
289289
tv_result, _, _ = torchvision.io.read_video(full_path, pts_unit="sec")
290290
tv_result = tv_result.permute(0, 3, 1, 2)
291291
# pass 2: decode all frames using new api
292-
reader = torch.classes.torchvision.Video(full_path, "video")
292+
reader = Video(full_path, "video")
293293
frames = []
294294
t, _ = reader.next()
295295
while t.numel() > 0:
@@ -310,7 +310,7 @@ def test_read_video_tensor(self):
310310
# s = min(r)
311311
# e = max(r)
312312

313-
# reader = torch.classes.torchvision.Video(full_path, "video")
313+
# reader = Video(full_path, "video")
314314
# results = _template_read_video(reader, s, e)
315315
# tv_video, tv_audio, info = torchvision.io.read_video(
316316
# full_path, start_pts=s, end_pts=e, pts_unit="sec"
@@ -329,7 +329,7 @@ def test_read_video_tensor(self):
329329
# full_path, pts_unit="sec"
330330
# )
331331
# # pass 2: decode all frames using new api
332-
# reader = torch.classes.torchvision.Video(full_path, "video")
332+
# reader = Video(full_path, "video")
333333
# pts = []
334334
# t, p = reader.next()
335335
# while t.numel() > 0:
@@ -353,7 +353,7 @@ def test_metadata(self):
353353
torchvision.set_video_backend("pyav")
354354
for test_video, config in test_videos.items():
355355
full_path = os.path.join(VIDEO_DIR, test_video)
356-
reader = torch.classes.torchvision.Video(full_path, "video")
356+
reader = Video(full_path, "video")
357357
reader_md = reader.get_metadata()
358358
self.assertAlmostEqual(
359359
config.video_fps, reader_md["video"]["fps"][0], delta=0.0001
@@ -372,7 +372,7 @@ def test_video_reading_fn(self):
372372

373373
ref_result = _decode_frames_by_av_module(full_path)
374374

375-
reader = torch.classes.torchvision.Video(full_path, "video")
375+
reader = Video(full_path, "video")
376376
newapi_result = _template_read_video(reader)
377377

378378
# First we check if the frames are approximately the same

torchvision/io/__init__.py

Lines changed: 94 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import torch
2+
13
from ._video_opt import (
24
Timebase,
35
VideoMetaData,
@@ -20,10 +22,94 @@
2022
encode_jpeg,
2123
write_jpeg,
2224
encode_png,
23-
write_png
25+
write_png,
2426
)
2527

2628

29+
if _HAS_VIDEO_OPT:

    class Video:
        """
        Fine-grained video-reading API.

        Supports frame-by-frame reading of various streams from a single video
        container.

        Args:
            path (string): Path to the video file in a supported format.
            stream (string, optional): Descriptor of the required stream, of the
                form ``"stream_type:stream_id"``. Defaults to ``"video:0"``.
                Currently available stream types include
                ``['video', 'audio', 'cc', 'sub']``.

        Example:
            The following example creates a :class:`Video` object, seeks to the
            2-second point, and returns a single frame::

                import torchvision
                video_path = "path_to_a_test_video"

                reader = torchvision.io.Video(video_path, "video")
                reader.seek(2.0)
                frame, timestamp = reader.next()
        """

        def __init__(self, path: str, stream: str = "video"):
            # Delegate all decoding to the torchscript-registered C++ class;
            # the constructor allocates a threaded decoder instance per video.
            self._c = torch.classes.torchvision.Video(path, stream)

        def next(self):
            """Iterator that decodes the next frame of the current stream.

            Returns:
                ([torch.Tensor, float]): list containing the decoded frame and
                the corresponding timestamp. The returned tensor is empty
                (``numel() == 0``) once the end of the stream is reached.
            """
            return self._c.next()

        def seek(self, time_s: float):
            """Seek within the current stream.

            Args:
                time_s (float): seek time in seconds

            .. note::
                The current implementation is the so-called precise seek. This
                means that following a seek, a call to :func:`next` will return
                the frame with the exact timestamp if it exists, or
                the first frame with a timestamp larger than ``time_s``.
            """
            self._c.seek(time_s)

        def get_metadata(self):
            """Return the video metadata.

            Returns:
                (dict): dictionary of dictionaries containing duration and
                other relevant attributes (e.g. frame rate) for every stream,
                structured as ``{"stream_type": {"attribute": [value per stream]}}``.
            """
            return self._c.get_metadata()

        def set_current_stream(self, stream: str):
            """Set the current stream.

            Explicitly define the stream we are operating on.

            Args:
                stream (string): Descriptor of the required stream. Defaults to
                    ``"video:0"``. Currently available stream types include
                    ``['video', 'audio', 'cc', 'sub']``.
                    Each descriptor consists of two parts: stream type (e.g.
                    ``'video'``) and a unique stream id (which is determined by
                    the video encoding). In this way, if the video container
                    contains multiple streams of the same type, users can
                    access the one they want. If only the stream type is
                    passed, the decoder auto-detects the first stream of that
                    type and returns it.

            Returns:
                (bool): True on success, False otherwise
            """
            return self._c.set_current_stream(stream)


else:
    # Video ops extension is unavailable in this build; expose a sentinel so
    # `from torchvision.io import Video` still succeeds.
    Video = None
111+
112+
27113
__all__ = [
28114
"write_video",
29115
"read_video",
@@ -39,10 +125,11 @@
39125
"_read_video_meta_data",
40126
"VideoMetaData",
41127
"Timebase",
42-
'read_image',
43-
'decode_image',
44-
'encode_jpeg',
45-
'write_jpeg',
46-
'encode_png',
47-
'write_png',
128+
"read_image",
129+
"decode_image",
130+
"encode_jpeg",
131+
"write_jpeg",
132+
"encode_png",
133+
"write_png",
134+
"Video",
48135
]

0 commit comments

Comments
 (0)