Add AudioEffector (#3163)

mthrok · facebook-github-bot · commit ea3b60a95a2d · 2023-03-31T09:51:01.000-07:00
Summary: This commit adds a new feature AudioEffector, which can be used to apply various effects and codecs to waveforms in Tensor. Under the hood it uses StreamWriter and StreamReader to apply filters and encode/decode. This is going to replace the deprecated `apply_codec` and `apply_sox_effect_tensor` functions. It can also perform online, chunk-by-chunk filtering. Tutorial to follow. closes #3161 Pull Request resolved: #3163 Differential Revision: D44576660 Pulled By: mthrok fbshipit-source-id: 42097e758598c098313ff5a6b9563183604d6842
diff --git a/docs/source/_templates/autosummary/io_class.rst b/docs/source/_templates/autosummary/io_class.rst
@@ -42,7 +42,6 @@ Methods
    not item.startswith('_')
    and item not in inherited_members
    and item not in attributes
-   and item != "CodecConfig"
    %}
 
 {{ item | underline("~") }}
@@ -56,11 +55,12 @@ Methods
 {%- endif %}
 
 
-{%- if name == "StreamReader" %}
+{%- if name in ["StreamReader", "StreamWriter"] %}
 
 Support Structures
 ------------------
 
+{%- if name == "StreamReader" %}
 {%- for item in [
     "ChunkTensor",
     "SourceStream",
@@ -77,15 +77,14 @@ Support Structures
    :members:
 
 {%- endfor %}
-{%- elif name == "StreamWriter" %}
 
-Support Structures
-------------------
+{%- elif name == "StreamWriter" %}
 
 CodecConfig
 ~~~~~~~~~~~
 
-.. autoclass:: torchaudio.io::StreamWriter.CodecConfig()
+.. autoclass:: torchaudio.io::CodecConfig
    :members:
 
 {%- endif %}
+{%- endif %}
diff --git a/docs/source/io.rst b/docs/source/io.rst
@@ -12,6 +12,7 @@ torchaudio.io
 
    StreamReader
    StreamWriter
+   AudioEffector
    play_audio
 
 .. rubric:: Tutorials using ``torchaudio.io``
diff --git a/test/torchaudio_unittest/io/common.py b/test/torchaudio_unittest/io/common.py
@@ -0,0 +1,16 @@
+import torchaudio
+
+
+# On FFmpeg 4.1 or older, tests that check the number of output samples
+# from OPUS fail; they pass on 4.2+. Probably this commit fixed it:
+# https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
+def lt42():
+    """Return True if the reported libavcodec version is older than 58.54 (FFmpeg 4.2)."""
+    ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
+    # 5.1 libavcodec     59. 18.100
+    # 4.4 libavcodec     58.134.100
+    # 4.3 libavcodec     58. 91.100
+    # 4.2 libavcodec     58. 54.100
+    # 4.1 libavcodec     58. 35.100
+    # Compare (major, minor) as a pair so that an older major is always older.
+    return tuple(ver[:2]) < (58, 54)
diff --git a/test/torchaudio_unittest/io/effector_test.py b/test/torchaudio_unittest/io/effector_test.py
@@ -0,0 +1,82 @@
+from parameterized import parameterized
+
+from torchaudio.io import AudioEffector
+from torchaudio_unittest.common_utils import get_sinusoid, skipIfNoFFmpeg, TorchaudioTestCase
+
+from .common import lt42
+
+
+@skipIfNoFFmpeg
+class EffectorTest(TorchaudioTestCase):
+    def test_null(self):
+        """No effect and codec will return the same result"""
+        sample_rate = 8000
+        frames_per_chunk = 256
+
+        effector = AudioEffector(effect=None, format=None)
+        original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)
+
+        # one-go
+        output = effector.apply(original, sample_rate)
+        self.assertEqual(original, output)
+        # streaming
+        for i, chunk in enumerate(effector.stream(original, sample_rate, frames_per_chunk)):
+            start = i * frames_per_chunk
+            end = (i + 1) * frames_per_chunk
+            self.assertEqual(original[start:end, :], chunk)
+
+    @parameterized.expand(
+        [
+            ("ogg", "flac"),  # flac only supports s16 and s32
+            ("ogg", "opus"),  # opus only supports 48k Hz
+            ("ogg", "vorbis"),  # vorbis only supports stereo
+            ("wav", None),
+            ("wav", "pcm_u8"),
+            ("mp3", None),
+        ]
+    )
+    def test_formats(self, format, encoder):
+        """Formats (some with restrictions) just work without an issue in effector"""
+        sample_rate = 8000
+
+        effector = AudioEffector(format=format, encoder=encoder)
+        original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)
+
+        output = effector.apply(original, sample_rate)
+
+        # On FFmpeg 4.1, OPUS produces 8020 samples (20 extra);
+        # this has been fixed in 4.2+.
+        if encoder == "opus" and lt42():
+            return
+
+        self.assertEqual(original.shape, output.shape)
+
+        # Note
+        # MP3 adds padding which cannot be removed when the encoded data is written to
+        # a file-like object without a seek method.
+        # The amount of padding is retrievable as `AVCodecContext::initial_padding`
+        # https://ffmpeg.org/doxygen/4.1/structAVCodecContext.html#a8f95550ce04f236e9915516d04d3d1ab
+        # but this is not exposed yet.
+        # These "priming" samples have negative timestamps, so we could also add logic
+        # to discard them at decoding; however, as far as I checked, when data is loaded
+        # with StreamReader, the timestamp is reset. I tried options like avoid_negative_ts,
+        # https://ffmpeg.org/ffmpeg-formats.html
+        # but it made no difference. Perhaps this is because the information about negative
+        # timestamps is only available on the encoding side, and it is presumably written to
+        # the file header, but somehow that does not happen with a file-like object.
+        # Need to investigate more to remove MP3 padding.
+        if format == "mp3":
+            return
+
+        for chunk in effector.stream(original, sample_rate, frames_per_chunk=original.size(0)):
+            self.assertEqual(original.shape, chunk.shape)
+
+    @parameterized.expand([("loudnorm=I=-16:LRA=11:TP=-1.5",), ("volume=2",)])
+    def test_effect(self, effect):
+        sample_rate = 8000
+
+        effector = AudioEffector(effect=effect)
+        original = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)
+
+        output = effector.apply(original, sample_rate)
+        self.assertEqual(original.shape, output.shape)
diff --git a/test/torchaudio_unittest/io/stream_writer_test.py b/test/torchaudio_unittest/io/stream_writer_test.py
@@ -16,9 +16,10 @@
     TempDirMixin,
     TorchaudioTestCase,
 )
+from .common import lt42
 
 if is_ffmpeg_available():
-    from torchaudio.io import StreamReader, StreamWriter
+    from torchaudio.io import CodecConfig, StreamReader, StreamWriter
 
 
 def get_audio_chunk(fmt, sample_rate, num_channels):
@@ -380,20 +381,11 @@ def test_audio_num_frames_lossy(self, ext, num_channels, sample_rate):
         s.process_all_packets()
         (saved,) = s.pop_chunks()
 
-        # This test fails for OPUS if FFmpeg is 4.1, but it passes for 4.2+
-        # 4.1 produces 48312 samples (extra 312)
-        # Probably this commit fixes it.
-        # https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
-        # TODO: issue warning if 4.1?
-        if ext == "opus":
-            ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
-            # 5.1 libavcodec     59. 18.100
-            # 4.4 libavcodec     58.134.100
-            # 4.3 libavcodec     58. 91.100
-            # 4.2 libavcodec     58. 54.100
-            # 4.1 libavcodec     58. 35.100
-            if ver[0] < 59 and ver[1] < 54:
-                return
+        # On 4.1 OPUS produces 48312 samples (extra 312)
+        # this has been fixed on 4.2+
+        # TODO: issue warning if on 4.1?
+        if ext == "opus" and lt42():
+            return
         self.assertEqual(saved.shape, data.shape)
 
     def test_preserve_fps(self):
@@ -534,7 +526,7 @@ def test_codec_config(self):
         # Write data
         dst = self.get_temp_path(filename)
         writer = torchaudio.io.StreamWriter(dst=dst, format=ext)
-        codec_config = torchaudio.io.StreamWriter.CodecConfig(bit_rate=198_000, compression_level=3)
+        codec_config = CodecConfig(bit_rate=198_000, compression_level=3)
         writer.add_audio_stream(sample_rate=sample_rate, num_channels=num_channels, codec_config=codec_config)
 
         audio = torch.zeros((8000, 2))
@@ -553,7 +545,7 @@ def write_audio(buffer, bit_rate):
             writer.add_audio_stream(
                 sample_rate=sample_rate,
                 num_channels=num_channels,
-                codec_config=torchaudio.io.StreamWriter.CodecConfig(bit_rate=bit_rate),
+                codec_config=CodecConfig(bit_rate=bit_rate),
             )
 
             with writer.open():
diff --git a/torchaudio/io/__init__.py b/torchaudio/io/__init__.py
@@ -1,10 +1,13 @@
+from ._effector import AudioEffector
 from ._playback import play_audio
 from ._stream_reader import StreamReader
-from ._stream_writer import StreamWriter
+from ._stream_writer import CodecConfig, StreamWriter
 
 
 __all__ = [
+    "AudioEffector",
     "StreamReader",
     "StreamWriter",
+    "CodecConfig",
     "play_audio",
 ]
diff --git a/torchaudio/io/_effector.py b/torchaudio/io/_effector.py
diff --git a/torchaudio/io/_stream_writer.py b/torchaudio/io/_stream_writer.py