Merge branch 'main' of github.com:pytorch/torchcodec into sample_rate

NicolasHug · NicolasHug · commit 7cb2271dbf6f · 2025-03-20T10:09:18.000Z
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.cpp b/src/torchcodec/decoders/_core/VideoDecoder.cpp
@@ -7,6 +7,7 @@
 #include "src/torchcodec/decoders/_core/VideoDecoder.h"
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <iostream>
 #include <limits>
 #include <sstream>
@@ -67,7 +68,7 @@ std::vector<std::string> splitStringWithDelimiters(
 
 VideoDecoder::VideoDecoder(const std::string& videoFilePath, SeekMode seekMode)
     : seekMode_(seekMode) {
-  av_log_set_level(AV_LOG_QUIET);
+  setFFmpegLogLevel();
 
   AVFormatContext* rawContext = nullptr;
   int status =
@@ -86,7 +87,7 @@ VideoDecoder::VideoDecoder(const void* data, size_t length, SeekMode seekMode)
     : seekMode_(seekMode) {
   TORCH_CHECK(data != nullptr, "Video data buffer cannot be nullptr!");
 
-  av_log_set_level(AV_LOG_QUIET);
+  setFFmpegLogLevel();
 
   constexpr int bufferSize = 64 * 1024;
   ioBytesContext_.reset(new AVIOBytesContext(data, length, bufferSize));
@@ -206,6 +207,39 @@ void VideoDecoder::initializeDecoder() {
   initialized_ = true;
 }
 
+// Sets FFmpeg's global log level from the TORCHCODEC_FFMPEG_LOG_LEVEL env var
+// (upper-case level name; unset means QUIET). Unknown names fail a TORCH_CHECK.
+// Names are compared as std::string, so std::strcmp/<cstring> is not needed.
+void VideoDecoder::setFFmpegLogLevel() {
+  struct NamedLogLevel {
+    const char* name;
+    int level;
+  };
+  static constexpr NamedLogLevel kLogLevels[] = {
+      {"QUIET", AV_LOG_QUIET},
+      {"PANIC", AV_LOG_PANIC},
+      {"FATAL", AV_LOG_FATAL},
+      {"ERROR", AV_LOG_ERROR},
+      {"WARNING", AV_LOG_WARNING},
+      {"INFO", AV_LOG_INFO},
+      {"VERBOSE", AV_LOG_VERBOSE},
+      {"DEBUG", AV_LOG_DEBUG},
+      {"TRACE", AV_LOG_TRACE}};
+  const char* logLevelEnv = std::getenv("TORCHCODEC_FFMPEG_LOG_LEVEL");
+  const std::string requested(logLevelEnv != nullptr ? logLevelEnv : "QUIET");
+  for (const auto& candidate : kLogLevels) {
+    if (requested == candidate.name) {
+      av_log_set_level(candidate.level);
+      return;
+    }
+  }
+  TORCH_CHECK(
+      false,
+      "Invalid TORCHCODEC_FFMPEG_LOG_LEVEL: ",
+      logLevelEnv,
+      ". Use e.g. 'QUIET', 'PANIC', 'VERBOSE', etc.");
+}
+
 int VideoDecoder::getBestStreamIndex(AVMediaType mediaType) {
   AVCodecOnlyUseForCallingAVFindBestStream avCodec = nullptr;
   int streamIndex =
@@ -1750,7 +1784,10 @@ void VideoDecoder::createSwrContext(
   TORCH_CHECK(
       status == AVSUCCESS,
       "Couldn't initialize SwrContext: ",
-      getFFMPEGErrorStringFromErrorCode(status));
+      getFFMPEGErrorStringFromErrorCode(status),
+      ". If the error says 'Invalid argument', it's likely that you are using "
+      "a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
+      "valid scenarios. Try to upgrade FFmpeg?");
   streamInfo.swrContext.reset(swrContext);
 }
 
diff --git a/src/torchcodec/decoders/_core/VideoDecoder.h b/src/torchcodec/decoders/_core/VideoDecoder.h
@@ -363,6 +363,7 @@ class VideoDecoder {
   // --------------------------------------------------------------------------
 
   void initializeDecoder();
+  void setFFmpegLogLevel();
   // --------------------------------------------------------------------------
   // DECODING APIS AND RELATED UTILS
   // --------------------------------------------------------------------------
diff --git a/test/decoders/test_decoders.py b/test/decoders/test_decoders.py
@@ -25,6 +25,7 @@
     NASA_AUDIO,
     NASA_AUDIO_MP3,
     NASA_VIDEO,
+    SINE_MONO_S16,
     SINE_MONO_S32,
     SINE_MONO_S32_44100,
     SINE_MONO_S32_8000,
@@ -1155,3 +1156,24 @@ def test_sample_rate_conversion(self, start_seconds, stop_seconds):
             atol=atol,
             rtol=rtol,
         )
+
+    def test_s16_ffmpeg4_bug(self):
+        # s16 fails on FFmpeg4 but can be decoded on other versions.
+        # Debugging logs show that we're hitting:
+        # [SWR @ 0x560a7abdaf80] Input channel count and layout are unset
+        # which seems to point to:
+        # https://github.com/FFmpeg/FFmpeg/blob/40a6963fbd0c47be358a3760480180b7b532e1e9/libswresample/swresample.c#L293-L305
+        # ¯\_(ツ)_/¯
+
+        asset = SINE_MONO_S16
+        decoder = AudioDecoder(asset.path)
+        assert decoder.metadata.sample_rate == asset.sample_rate
+        assert decoder.metadata.sample_format == asset.sample_format
+
+        cm = (
+            pytest.raises(RuntimeError, match="Invalid argument")
+            if get_ffmpeg_major_version() == 4
+            else contextlib.nullcontext()
+        )
+        with cm:
+            decoder.get_samples_played_in_range(start_seconds=0)
diff --git a/test/resources/sine_mono_s16.wav b/test/resources/sine_mono_s16.wav
diff --git a/test/resources/sine_mono_s16.wav.stream0.all_frames_info.json b/test/resources/sine_mono_s16.wav.stream0.all_frames_info.json
@@ -0,0 +1,130 @@
+[
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.000000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.128000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.256000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.384000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.512000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.640000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.768000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "0.896000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.024000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.152000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.280000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.408000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.536000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.664000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.792000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "1.920000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.048000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.176000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.304000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.432000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.560000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.688000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.816000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "2.944000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.072000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.200000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.328000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.456000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.584000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.712000"
+  },
+  {
+    "duration_time": "0.128000",
+    "pts_time": "3.840000"
+  },
+  {
+    "duration_time": "0.032000",
+    "pts_time": "3.968000"
+  }
+]
diff --git a/test/utils.py b/test/utils.py
@@ -496,6 +496,23 @@ def sample_format(self) -> str:
     },
 )
 
+# Same sample rate as SINE_MONO_S32, but encoded as s16 instead of s32. Generated with:
+# ffmpeg -i test/resources/sine_mono_s32.wav -ar 16000 -c:a pcm_s16le test/resources/sine_mono_s16.wav
+SINE_MONO_S16 = TestAudio(
+    filename="sine_mono_s16.wav",
+    default_stream_index=0,
+    frames={},  # Automatically loaded from json file
+    stream_infos={
+        0: TestAudioStreamInfo(
+            sample_rate=16_000,
+            num_channels=1,
+            duration_seconds=4,
+            num_frames=63,
+            sample_format="s16",
+        )
+    },
+)
+
 H265_VIDEO = TestVideo(
     filename="h265_video.mp4",
     default_stream_index=0,