|
5 | 5 | # LICENSE file in the root directory of this source tree.
|
6 | 6 |
|
7 | 7 | """
|
8 |
| -================================================== |
9 |
| -Basic Example to use TorchCodec to decode a video. |
10 |
| -================================================== |
| 8 | +======================================== |
| 9 | +Decoding a video with SimpleVideoDecoder |
| 10 | +======================================== |
11 | 11 |
|
12 |
| -A simple example showing how to decode the first few frames of a video using |
13 |
| -the :class:`~torchcodec.decoders.SimpleVideoDecoder` class. |
| 12 | +In this example, we'll learn how to decode a video using the |
| 13 | +:class:`~torchcodec.decoders.SimpleVideoDecoder` class. |
14 | 14 | """
|
15 | 15 |
|
16 | 16 | # %%
|
17 |
| -import inspect |
18 |
| -import os |
| 17 | +# First, a bit of boilerplate: we'll download a video from the web, and define a |
| 18 | +# plotting utility. You can ignore that part and jump right below to |
| 19 | +# :ref:`creating_decoder`. |
19 | 20 |
|
| 21 | +from typing import Optional |
| 22 | +import torch |
| 23 | +import requests |
| 24 | + |
| 25 | + |
| 26 | +# Video source: https://www.pexels.com/video/dog-eating-854132/ |
| 27 | +# License: CC0. Author: Coverr. |
| 28 | +url = "https://videos.pexels.com/video-files/854132/854132-sd_640_360_25fps.mp4" |
| 29 | +response = requests.get(url) |
| 30 | +if response.status_code != 200: |
| 31 | + raise RuntimeError(f"Failed to download video. {response.status_code = }.") |
| 32 | + |
| 33 | +raw_video_bytes = response.content |
| 34 | + |
| 35 | + |
def plot(frames: torch.Tensor, title: Optional[str] = None):
    """Display ``frames`` as a single image grid with an optional title.

    Plotting is best-effort: if torchvision or matplotlib is missing we
    print an install hint and return instead of raising, so the tutorial
    can still run end-to-end without the plotting extras.

    Args:
        frames: a (C, H, W) frame or an (N, C, H, W) batch of frames.
        title: optional title drawn above the grid.
    """
    try:
        from torchvision.utils import make_grid
        from torchvision.transforms.v2.functional import to_pil_image
        import matplotlib.pyplot as plt
    except ImportError:
        print("Cannot plot, please run `pip install torchvision matplotlib`")
        return

    plt.rcParams["savefig.bbox"] = 'tight'
    # Only the Axes is needed; discard the Figure handle instead of
    # binding an unused ``fig`` local.
    _, ax = plt.subplots()
    ax.imshow(to_pil_image(make_grid(frames)))
    ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
    if title is not None:
        ax.set_title(title)
    plt.tight_layout()
| 52 | + |
| 53 | + |
| 54 | +# %% |
| 55 | +# .. _creating_decoder: |
| 56 | +# |
| 57 | +# Creating a decoder |
| 58 | +# ------------------ |
| 59 | +# |
| 60 | +# We can now create a decoder from the raw (encoded) video bytes. You can of |
| 61 | +# course use a local video file and pass the path as input, rather than download |
| 62 | +# a video. |
20 | 63 | from torchcodec.decoders import SimpleVideoDecoder
|
21 | 64 |
|
| 65 | +# You can also pass a path to a local file! |
| 66 | +decoder = SimpleVideoDecoder(raw_video_bytes) |
| 67 | + |
22 | 68 | # %%
|
23 |
| -my_path = os.path.abspath(inspect.getfile(inspect.currentframe())) |
24 |
| -video_file_path = os.path.dirname(my_path) + "/../test/resources/nasa_13013.mp4" |
25 |
| -simple_decoder = SimpleVideoDecoder(video_file_path) |
| 69 | +# The video has not yet been decoded by the decoder, but we already have access to |
| 70 | +# some metadata via the ``metadata`` attribute which is a |
| 71 | +# :class:`~torchcodec.decoders.VideoStreamMetadata` object. |
| 72 | +print(decoder.metadata) |
26 | 73 |
|
27 | 74 | # %%
|
28 |
| -# You can get the total frame count for the best video stream by calling len(). |
29 |
| -num_frames = len(simple_decoder) |
30 |
| -print(f"{video_file_path=} has {num_frames} frames") |
| 75 | +# Decoding frames by indexing the decoder |
| 76 | +# --------------------------------------- |
| 77 | + |
| 78 | +first_frame = decoder[0] # using a single int index |
| 79 | +every_twenty_frame = decoder[0 : -1 : 20] # using slices |
| 80 | + |
| 81 | +print(f"{first_frame.shape = }") |
| 82 | +print(f"{first_frame.dtype = }") |
| 83 | +print(f"{every_twenty_frame.shape = }") |
| 84 | +print(f"{every_twenty_frame.dtype = }") |
31 | 85 |
|
32 | 86 | # %%
|
33 |
| -# You can get the decoded frame by using the subscript operator. |
34 |
| -first_frame = simple_decoder[0] |
35 |
| -print(f"decoded frame has type {type(first_frame)}") |
| 87 | +# Indexing the decoder returns the frames as :class:`torch.Tensor` objects. |
| 88 | +# By default, the shape of the frames is ``(N, C, H, W)`` where N is the batch |
| 89 | +# size, C is the number of channels, H is the height, and W is the width of the |
| 90 | +# frames. The batch dimension N is only present when we're decoding more than |
| 91 | +# one frame. The dimension order can be changed to ``(N, H, W, C)`` using the |
| 92 | +# ``dimension_order`` parameter of |
| 93 | +# :class:`~torchcodec.decoders.SimpleVideoDecoder`. Frames are always of |
| 94 | +# ``torch.uint8`` dtype. |
| 95 | +# |
| 96 | + |
| 97 | +plot(first_frame, "First frame") |
| 98 | + |
| 99 | +# %% |
| 100 | +plot(every_twenty_frame, "Every 20 frame") |
| 101 | + |
| 102 | +# %% |
| 103 | +# Iterating over frames |
| 104 | +# --------------------- |
| 105 | +# |
| 106 | +# The decoder is a normal iterable object and can be iterated over like so: |
| 107 | + |
| 108 | +for frame in decoder: |
| 109 | + assert ( |
| 110 | + isinstance(frame, torch.Tensor) |
| 111 | + and frame.shape == (3, decoder.metadata.height, decoder.metadata.width) |
| 112 | + ) |
36 | 113 |
|
37 | 114 | # %%
|
38 |
| -# The shape of the decoded frame is (H, W, C) where H and W are the height |
39 |
| -# and width of the video frame. C is 3 because we have 3 channels red, green, |
40 |
| -# and blue. |
41 |
| -print(f"{first_frame.shape=}") |
| 115 | +# Retrieving pts and duration of frames |
| 116 | +# ------------------------------------- |
| 117 | +# |
| 118 | +# Indexing the decoder returns pure :class:`torch.Tensor` objects. Sometimes, it |
| 119 | +# can be useful to retrieve additional information about the frames, such as |
| 120 | +# their :term:`pts` (Presentation Time Stamp), and their duration. |
| 121 | +# This can be achieved using the |
| 122 | +# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_at` and |
| 123 | +# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frames_at` methods, which |
| 124 | +# will return a :class:`~torchcodec.decoders.Frame` and |
| 125 | +# :class:`~torchcodec.decoders.FrameBatch` objects respectively. |
| 126 | + |
| 127 | +last_frame = decoder.get_frame_at(len(decoder) - 1) |
| 128 | +print(f"{type(last_frame) = }") |
| 129 | +print(last_frame) |
| 130 | + |
| 131 | +# %% |
| 132 | +middle_frames = decoder.get_frames_at(start=10, stop=20, step=2) |
| 133 | +print(f"{type(middle_frames) = }") |
| 134 | +print(middle_frames) |
42 | 135 |
|
43 | 136 | # %%
|
44 |
| -# The dtype of the decoded frame is ``torch.uint8``. |
45 |
| -print(f"{first_frame.dtype=}") |
| 137 | +plot(last_frame.data, "Last frame") |
| 138 | +plot(middle_frames.data, "Middle frames") |
46 | 139 |
|
47 | 140 | # %%
|
48 |
| -# Negative indexes are supported. |
49 |
| -last_frame = simple_decoder[-1] |
50 |
| -print(f"{last_frame.shape=}") |
| 141 | +# Both :class:`~torchcodec.decoders.Frame` and |
| 142 | +# :class:`~torchcodec.decoders.FrameBatch` have a ``data`` field, which contains |
| 143 | +# the decoded tensor data. They also have the ``pts_seconds`` and |
| 144 | +# ``duration_seconds`` fields which are single ints for |
| 145 | +# :class:`~torchcodec.decoders.Frame`, and 1-D :class:`torch.Tensor` for |
| 146 | +# :class:`~torchcodec.decoders.FrameBatch` (one value per frame in the batch). |
| 147 | + |
| 148 | +# %% |
| 149 | +# Using time-based indexing |
| 150 | +# ------------------------- |
| 151 | +# |
| 152 | +# So far, we have retrieved frames based on their index. We can also retrieve |
| 153 | +# frames based on *when* they are displayed. The available methods are |
| 154 | +# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frame_displayed_at` and |
| 155 | +# :meth:`~torchcodec.decoders.SimpleVideoDecoder.get_frames_displayed_at`, which |
| 156 | +# also return :class:`~torchcodec.decoders.Frame` and |
| 157 | +# :class:`~torchcodec.decoders.FrameBatch` objects respectively. |
51 | 158 |
|
52 |
| -# TODO_BEFORE_RELEASE: add documentation for slices and metadata. |
| 159 | +frame_at_2_seconds = decoder.get_frame_displayed_at(seconds=2) |
| 160 | +print(f"{type(frame_at_2_seconds) = }") |
| 161 | +print(frame_at_2_seconds) |
| 162 | +plot(frame_at_2_seconds.data, "Frame displayed at 2 seconds") |
| 163 | +# TODO_BEFORE_RELEASE: illustrate get_frames_displayed_at |
0 commit comments