Skip to content

Commit ea3b60a

Browse files
mthrok authored and facebook-github-bot committed
Add AudioEffector (#3163)
Summary: This commit adds a new feature AudioEffector, which can be used to apply various effects and codecs to waveforms in Tensor. Under the hood it uses StreamWriter and StreamReader to apply filters and encode/decode. This is going to replace the deprecated `apply_codec` and `apply_sox_effect_tensor` functions. It can also perform online, chunk-by-chunk filtering. Tutorial to follow. closes #3161 Pull Request resolved: #3163 Differential Revision: D44576660 Pulled By: mthrok fbshipit-source-id: 42097e758598c098313ff5a6b9563183604d6842
1 parent bb75caa commit ea3b60a

File tree

8 files changed

+479
-50
lines changed

8 files changed

+479
-50
lines changed

docs/source/_templates/autosummary/io_class.rst

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ Methods
4242
not item.startswith('_')
4343
and item not in inherited_members
4444
and item not in attributes
45-
and item != "CodecConfig"
4645
%}
4746

4847
{{ item | underline("~") }}
@@ -56,11 +55,12 @@ Methods
5655
{%- endif %}
5756

5857

59-
{%- if name == "StreamReader" %}
58+
{%- if name in ["StreamReader", "StreamWriter"] %}
6059

6160
Support Structures
6261
------------------
6362

63+
{%- if name == "StreamReader" %}
6464
{%- for item in [
6565
"ChunkTensor",
6666
"SourceStream",
@@ -77,15 +77,14 @@ Support Structures
7777
:members:
7878

7979
{%- endfor %}
80-
{%- elif name == "StreamWriter" %}
8180

82-
Support Structures
83-
------------------
81+
{%- elif name == "StreamWriter" %}
8482

8583
CodecConfig
8684
~~~~~~~~~~~
8785

88-
.. autoclass:: torchaudio.io::StreamWriter.CodecConfig()
86+
.. autoclass:: torchaudio.io::CodecConfig
8987
:members:
9088

9189
{%- endif %}
90+
{%- endif %}

docs/source/io.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ torchaudio.io
1212

1313
StreamReader
1414
StreamWriter
15+
AudioEffector
1516
play_audio
1617

1718
.. rubric:: Tutorials using ``torchaudio.io``

test/torchaudio_unittest/io/common.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import torchaudio
2+
3+
4+
# If FFmpeg is 4.1 or older
5+
# Tests that check the number of output samples from OPUS fail
6+
# They work on 4.2+
7+
# Probably this commit fixed it.
8+
# https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
9+
def lt42(version=None):
    """Return ``True`` if libavcodec is older than the one shipped with FFmpeg 4.2.

    Args:
        version: Optional ``(major, minor, ...)`` sequence of integers describing
            a libavcodec version. When omitted, the version is read from
            ``torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]``.

    Returns:
        bool: ``True`` when ``(major, minor)`` is strictly lower than ``(58, 54)``.
    """
    if version is None:
        version = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
    # FFmpeg release -> libavcodec version
    # 5.1 libavcodec 59. 18.100
    # 4.4 libavcodec 58.134.100
    # 4.3 libavcodec 58. 91.100
    # 4.2 libavcodec 58. 54.100
    # 4.1 libavcodec 58. 35.100
    #
    # Compare (major, minor) lexicographically. Testing the two fields
    # independently would classify e.g. 57.107 as "not older than 4.2" because
    # its minor number alone exceeds 54.
    return tuple(version[:2]) < (58, 54)
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from parameterized import parameterized
2+
3+
from torchaudio.io import AudioEffector
4+
from torchaudio_unittest.common_utils import get_sinusoid, skipIfNoFFmpeg, TorchaudioTestCase
5+
6+
from .common import lt42
7+
8+
9+
@skipIfNoFFmpeg
class EffectorTest(TorchaudioTestCase):
    """Exercises ``AudioEffector`` through both ``apply`` and ``stream``."""

    def test_null(self):
        """Without an effect or a format, the input is returned untouched"""
        sample_rate, frames_per_chunk = 8000, 256

        effector = AudioEffector(effect=None, format=None)
        waveform = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)

        # Whole waveform processed in a single call.
        self.assertEqual(waveform, effector.apply(waveform, sample_rate))

        # Chunk-by-chunk processing: the i-th chunk must match the i-th slice
        # of ``frames_per_chunk`` frames of the input.
        offset = 0
        for chunk in effector.stream(waveform, sample_rate, frames_per_chunk):
            expected = waveform[offset : offset + frames_per_chunk, :]
            self.assertEqual(expected, chunk)
            offset += frames_per_chunk

    @parameterized.expand(
        [
            ("ogg", "flac"),  # flac only supports s16 and s32
            ("ogg", "opus"),  # opus only supports 48k Hz
            ("ogg", "vorbis"),  # vorbis only supports stereo
            ("wav", None),
            ("wav", "pcm_u8"),
            ("mp3", None),
        ]
    )
    def test_formats(self, format, encoder):
        """Formats, including ones with restrictions, pass through the effector"""
        sample_rate = 8000

        effector = AudioEffector(format=format, encoder=encoder)
        waveform = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)

        processed = effector.apply(waveform, sample_rate)

        # With FFmpeg 4.1, OPUS yields 8020 samples (20 extra);
        # 4.2+ does not have this problem, so the shape checks are skipped on 4.1.
        is_broken_opus = encoder == "opus" and lt42()
        if is_broken_opus:
            return

        self.assertEqual(waveform.shape, processed.shape)

        # Note
        # MP3 adds padding, and that padding cannot be removed when the encoded
        # data is written to a file-like object that has no seek method.
        # The amount of padding is available as `AVCodecContext::initial_padding`
        # https://ffmpeg.org/doxygen/4.1/structAVCodecContext.html#a8f95550ce04f236e9915516d04d3d1ab
        # but that field is not exposed yet.
        # These "priming" samples carry negative time stamps, so logic to drop
        # them at decoding time is another option. However, as far as I checked,
        # the time stamp is reset when data is loaded with StreamReader.
        # I tried options such as avoid_negative_ts,
        # https://ffmpeg.org/ffmpeg-formats.html
        # but they made no difference. Perhaps the information about negative
        # time stamps is only available on the encoding side and is presumably
        # written to the header, which somehow does not happen with a file-like
        # object.
        # More investigation is needed to remove the MP3 padding.
        if format == "mp3":
            return

        # Streaming with a chunk as large as the whole input.
        total_frames = waveform.size(0)
        for chunk in effector.stream(waveform, sample_rate, frames_per_chunk=total_frames):
            self.assertEqual(waveform.shape, chunk.shape)

    @parameterized.expand([("loudnorm=I=-16:LRA=11:TP=-1.5",), ("volume=2",)])
    def test_effect(self, effect):
        # Applying a filter must preserve the shape of the waveform.
        sample_rate = 8000

        effector = AudioEffector(effect=effect)
        waveform = get_sinusoid(n_channels=3, sample_rate=sample_rate, channels_first=False)

        processed = effector.apply(waveform, sample_rate)
        self.assertEqual(waveform.shape, processed.shape)

test/torchaudio_unittest/io/stream_writer_test.py

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
TempDirMixin,
1717
TorchaudioTestCase,
1818
)
19+
from .common import lt42
1920

2021
if is_ffmpeg_available():
21-
from torchaudio.io import StreamReader, StreamWriter
22+
from torchaudio.io import CodecConfig, StreamReader, StreamWriter
2223

2324

2425
def get_audio_chunk(fmt, sample_rate, num_channels):
@@ -380,20 +381,11 @@ def test_audio_num_frames_lossy(self, ext, num_channels, sample_rate):
380381
s.process_all_packets()
381382
(saved,) = s.pop_chunks()
382383

383-
# This test fails for OPUS if FFmpeg is 4.1, but it passes for 4.2+
384-
# 4.1 produces 48312 samples (extra 312)
385-
# Probably this commit fixes it.
386-
# https://github.com/FFmpeg/FFmpeg/commit/18aea7bdd96b320a40573bccabea56afeccdd91c
387-
# TODO: issue warning if 4.1?
388-
if ext == "opus":
389-
ver = torchaudio.utils.ffmpeg_utils.get_versions()["libavcodec"]
390-
# 5.1 libavcodec 59. 18.100
391-
# 4.4 libavcodec 58.134.100
392-
# 4.3 libavcodec 58. 91.100
393-
# 4.2 libavcodec 58. 54.100
394-
# 4.1 libavcodec 58. 35.100
395-
if ver[0] < 59 and ver[1] < 54:
396-
return
384+
# On 4.1 OPUS produces 48312 samples (extra 312)
385+
# this has been fixed on 4.2+
386+
# TODO: issue warning if on 4.1?
387+
if ext == "opus" and lt42():
388+
return
397389
self.assertEqual(saved.shape, data.shape)
398390

399391
def test_preserve_fps(self):
@@ -534,7 +526,7 @@ def test_codec_config(self):
534526
# Write data
535527
dst = self.get_temp_path(filename)
536528
writer = torchaudio.io.StreamWriter(dst=dst, format=ext)
537-
codec_config = torchaudio.io.StreamWriter.CodecConfig(bit_rate=198_000, compression_level=3)
529+
codec_config = CodecConfig(bit_rate=198_000, compression_level=3)
538530
writer.add_audio_stream(sample_rate=sample_rate, num_channels=num_channels, codec_config=codec_config)
539531

540532
audio = torch.zeros((8000, 2))
@@ -553,7 +545,7 @@ def write_audio(buffer, bit_rate):
553545
writer.add_audio_stream(
554546
sample_rate=sample_rate,
555547
num_channels=num_channels,
556-
codec_config=torchaudio.io.StreamWriter.CodecConfig(bit_rate=bit_rate),
548+
codec_config=CodecConfig(bit_rate=bit_rate),
557549
)
558550

559551
with writer.open():

torchaudio/io/__init__.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
1+
from ._effector import AudioEffector
12
from ._playback import play_audio
23
from ._stream_reader import StreamReader
3-
from ._stream_writer import StreamWriter
4+
from ._stream_writer import CodecConfig, StreamWriter
45

56

67
__all__ = [
8+
"AudioEffector",
79
"StreamReader",
810
"StreamWriter",
11+
"CodecConfig",
912
"play_audio",
1013
]

0 commit comments

Comments
 (0)