Fixed missing audio with pyav backend (#4064)

prabhat00155 · web-flow · commit 693e0ae8d4ad · 2021-07-25T11:22:07.000+01:00
diff --git a/test/test_video_reader.py b/test/test_video_reader.py
@@ -1,4 +1,5 @@
 import collections
+import itertools
 import math
 import os
 import unittest
@@ -1243,16 +1244,39 @@ def test_invalid_file(self):
         with self.assertRaises(RuntimeError):
             io.read_video('foo.mp4')
 
-    def test_audio_present(self):
-        """Test if audio frames are returned with video_reader backend."""
-        set_video_backend('video_reader')
+    def test_audio_present_pts(self):
+        """Test that both backends return audio when offsets are given in pts."""
+        backends = ['video_reader', 'pyav']
+        start_offsets = [0, 1000]
+        end_offsets = [3000, None]
+        for test_video, _ in test_videos.items():
+            full_path = os.path.join(VIDEO_DIR, test_video)
+            container = av.open(full_path)
+            if container.streams.audio:
+                for backend, start_offset, end_offset in itertools.product(
+                        backends, start_offsets, end_offsets):
+                    set_video_backend(backend)
+                    _, audio, _ = io.read_video(
+                        full_path, start_offset, end_offset, pts_unit='pts')
+                    self.assertGreaterEqual(audio.shape[0], 1)
+                    self.assertGreaterEqual(audio.shape[1], 1)
+
+    def test_audio_present_sec(self):
+        """Test that both backends return audio when offsets are given in seconds."""
+        backends = ['video_reader', 'pyav']
+        start_offsets = [0, 0.1]
+        end_offsets = [0.3, None]
         for test_video, _ in test_videos.items():
             full_path = os.path.join(VIDEO_DIR, test_video)
             container = av.open(full_path)
             if container.streams.audio:
-                _, audio, _ = io.read_video(full_path)
-                self.assertGreaterEqual(audio.shape[0], 1)
-                self.assertGreaterEqual(audio.shape[1], 1)
+                for backend, start_offset, end_offset in itertools.product(
+                        backends, start_offsets, end_offsets):
+                    set_video_backend(backend)
+                    _, audio, _ = io.read_video(
+                        full_path, start_offset, end_offset, pts_unit='sec')
+                    self.assertGreaterEqual(audio.shape[0], 1)
+                    self.assertGreaterEqual(audio.shape[1], 1)
 
 
 if __name__ == "__main__":
diff --git a/torchvision/io/video.py b/torchvision/io/video.py
@@ -283,22 +283,25 @@ def read_video(
     info = {}
     video_frames = []
     audio_frames = []
+    audio_timebase = _video_opt.default_timebase
 
     try:
         with av.open(filename, metadata_errors="ignore") as container:
+            if container.streams.audio:
+                audio_timebase = container.streams.audio[0].time_base
             time_base = _video_opt.default_timebase
             if container.streams.video:
                 time_base = container.streams.video[0].time_base
             elif container.streams.audio:
                 time_base = container.streams.audio[0].time_base
             # video_timebase is the default time_base
-            start_pts_sec, end_pts_sec, pts_unit = _video_opt._convert_to_sec(
+            start_pts, end_pts, pts_unit = _video_opt._convert_to_sec(
                 start_pts, end_pts, pts_unit, time_base)
             if container.streams.video:
                 video_frames = _read_from_stream(
                     container,
-                    start_pts_sec,
-                    end_pts_sec,
+                    start_pts,
+                    end_pts,
                     pts_unit,
                     container.streams.video[0],
                     {"video": 0},
@@ -311,8 +314,8 @@ def read_video(
             if container.streams.audio:
                 audio_frames = _read_from_stream(
                     container,
-                    start_pts_sec,
-                    end_pts_sec,
+                    start_pts,
+                    end_pts,
                     pts_unit,
                     container.streams.audio[0],
                     {"audio": 0},
@@ -334,6 +337,10 @@ def read_video(
     if aframes_list:
         aframes = np.concatenate(aframes_list, 1)
         aframes = torch.as_tensor(aframes)
+        if pts_unit == 'sec':
+            start_pts = int(math.floor(start_pts * (1 / audio_timebase)))
+            if end_pts != float("inf"):
+                end_pts = int(math.ceil(end_pts * (1 / audio_timebase)))
         aframes = _align_audio_frames(aframes, audio_frames, start_pts, end_pts)
     else:
         aframes = torch.empty((1, 0), dtype=torch.float32)