From 2e25279e5382f591272f2b5d01286cc11d976f79 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 18 Jul 2025 19:57:20 +0000 Subject: [PATCH 01/35] Add torchcodec mock with wav loading and saving --- test/torchcodec/decoders.py | 17 +++++++++++++++++ test/torchcodec/encoders.py | 10 ++++++++++ 2 files changed, 27 insertions(+) create mode 100644 test/torchcodec/decoders.py create mode 100644 test/torchcodec/encoders.py diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py new file mode 100644 index 0000000000..94f2d8c8c1 --- /dev/null +++ b/test/torchcodec/decoders.py @@ -0,0 +1,17 @@ +import test.torchaudio_unittest.common_utils.wav_utils as wav_utils + +class AudioDecoder: + def __init__(self, uri): + self.uri = uri + + def get_all_samples(self): + return wav_utils.load_wav(self.uri) + + +class AudioEncoder: + def __init__(self, data, sample_rate): + self.data = data + self.sample_rate = sample_rate + + def to_file(self, uri, bit_rate=None): + return wav_utils.save_wav(uri, self.data, self.sample_rate) diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py new file mode 100644 index 0000000000..5e9cc54968 --- /dev/null +++ b/test/torchcodec/encoders.py @@ -0,0 +1,10 @@ +import torchaudio_unittest.common_utils.wav_utils as wav_utils + +class AudioEncoder: + def __init__(self, data, sample_rate): + print("BEING CALLED") + self.data = data + self.sample_rate = sample_rate + + def to_file(self, uri, bit_rate=None): + return wav_utils.save_wav(uri, self.data, self.sample_rate) From dd90ff3dc707c734df761979df9f80153fde45f1 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Mon, 11 Aug 2025 21:55:18 +0000 Subject: [PATCH 02/35] WIP --- .../_templates/autosummary/torio_io_class.rst | 90 -- docs/source/libtorio.rst | 17 - docs/source/libtorio.stream_reader.rst | 155 --- docs/source/torio.io.rst | 30 - docs/source/torio.rst | 26 - docs/source/torio.utils.rst | 25 - src/libtorio/ffmpeg/CMakeLists.txt | 93 -- 
src/libtorio/ffmpeg/README.md | 134 --- src/libtorio/ffmpeg/ffmpeg.cpp | 148 --- src/libtorio/ffmpeg/ffmpeg.h | 214 ---- src/libtorio/ffmpeg/filter_graph.cpp | 241 ----- src/libtorio/ffmpeg/filter_graph.h | 88 -- src/libtorio/ffmpeg/hw_context.cpp | 40 - src/libtorio/ffmpeg/hw_context.h | 11 - src/libtorio/ffmpeg/pybind/pybind.cpp | 469 --------- .../stream_reader/buffer/chunked_buffer.cpp | 129 --- .../stream_reader/buffer/chunked_buffer.h | 33 - .../stream_reader/buffer/unchunked_buffer.cpp | 33 - .../stream_reader/buffer/unchunked_buffer.h | 23 - .../ffmpeg/stream_reader/conversion.cpp | 630 ----------- .../ffmpeg/stream_reader/conversion.h | 129 --- .../ffmpeg/stream_reader/packet_buffer.cpp | 20 - .../ffmpeg/stream_reader/packet_buffer.h | 16 - .../ffmpeg/stream_reader/post_process.cpp | 620 ----------- .../ffmpeg/stream_reader/post_process.h | 34 - .../ffmpeg/stream_reader/stream_processor.cpp | 396 ------- .../ffmpeg/stream_reader/stream_processor.h | 107 -- .../ffmpeg/stream_reader/stream_reader.cpp | 612 ----------- .../ffmpeg/stream_reader/stream_reader.h | 399 ------- src/libtorio/ffmpeg/stream_reader/typedefs.h | 165 --- .../ffmpeg/stream_writer/encode_process.cpp | 976 ----------------- .../ffmpeg/stream_writer/encode_process.h | 67 -- src/libtorio/ffmpeg/stream_writer/encoder.cpp | 62 -- src/libtorio/ffmpeg/stream_writer/encoder.h | 30 - .../ffmpeg/stream_writer/packet_writer.cpp | 36 - .../ffmpeg/stream_writer/packet_writer.h | 16 - .../ffmpeg/stream_writer/stream_writer.cpp | 390 ------- .../ffmpeg/stream_writer/stream_writer.h | 344 ------ .../ffmpeg/stream_writer/tensor_converter.cpp | 497 --------- .../ffmpeg/stream_writer/tensor_converter.h | 95 -- src/libtorio/ffmpeg/stream_writer/types.h | 19 - src/torio/__init__.py | 8 - src/torio/_extension/__init__.py | 13 - src/torio/_extension/utils.py | 147 --- src/torio/io/__init__.py | 9 - src/torio/io/_streaming_media_decoder.py | 977 ------------------ src/torio/io/_streaming_media_encoder.py | 502 
--------- src/torio/lib/__init__.py | 0 src/torio/utils/__init__.py | 4 - src/torio/utils/ffmpeg_utils.py | 275 ----- tools/setup_helpers/extension.py | 20 - 51 files changed, 9614 deletions(-) delete mode 100644 docs/source/_templates/autosummary/torio_io_class.rst delete mode 100644 docs/source/libtorio.rst delete mode 100644 docs/source/libtorio.stream_reader.rst delete mode 100644 docs/source/torio.io.rst delete mode 100644 docs/source/torio.rst delete mode 100644 docs/source/torio.utils.rst delete mode 100644 src/libtorio/ffmpeg/CMakeLists.txt delete mode 100644 src/libtorio/ffmpeg/README.md delete mode 100644 src/libtorio/ffmpeg/ffmpeg.cpp delete mode 100644 src/libtorio/ffmpeg/ffmpeg.h delete mode 100644 src/libtorio/ffmpeg/filter_graph.cpp delete mode 100644 src/libtorio/ffmpeg/filter_graph.h delete mode 100644 src/libtorio/ffmpeg/hw_context.cpp delete mode 100644 src/libtorio/ffmpeg/hw_context.h delete mode 100644 src/libtorio/ffmpeg/pybind/pybind.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.h delete mode 100644 src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h delete mode 100644 src/libtorio/ffmpeg/stream_reader/conversion.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/conversion.h delete mode 100644 src/libtorio/ffmpeg/stream_reader/packet_buffer.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/packet_buffer.h delete mode 100644 src/libtorio/ffmpeg/stream_reader/post_process.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/post_process.h delete mode 100644 src/libtorio/ffmpeg/stream_reader/stream_processor.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/stream_processor.h delete mode 100644 src/libtorio/ffmpeg/stream_reader/stream_reader.cpp delete mode 100644 src/libtorio/ffmpeg/stream_reader/stream_reader.h delete 
mode 100644 src/libtorio/ffmpeg/stream_reader/typedefs.h delete mode 100644 src/libtorio/ffmpeg/stream_writer/encode_process.cpp delete mode 100644 src/libtorio/ffmpeg/stream_writer/encode_process.h delete mode 100644 src/libtorio/ffmpeg/stream_writer/encoder.cpp delete mode 100644 src/libtorio/ffmpeg/stream_writer/encoder.h delete mode 100644 src/libtorio/ffmpeg/stream_writer/packet_writer.cpp delete mode 100644 src/libtorio/ffmpeg/stream_writer/packet_writer.h delete mode 100644 src/libtorio/ffmpeg/stream_writer/stream_writer.cpp delete mode 100644 src/libtorio/ffmpeg/stream_writer/stream_writer.h delete mode 100644 src/libtorio/ffmpeg/stream_writer/tensor_converter.cpp delete mode 100644 src/libtorio/ffmpeg/stream_writer/tensor_converter.h delete mode 100644 src/libtorio/ffmpeg/stream_writer/types.h delete mode 100644 src/torio/__init__.py delete mode 100644 src/torio/_extension/__init__.py delete mode 100644 src/torio/_extension/utils.py delete mode 100644 src/torio/io/__init__.py delete mode 100644 src/torio/io/_streaming_media_decoder.py delete mode 100644 src/torio/io/_streaming_media_encoder.py delete mode 100644 src/torio/lib/__init__.py delete mode 100644 src/torio/utils/__init__.py delete mode 100644 src/torio/utils/ffmpeg_utils.py diff --git a/docs/source/_templates/autosummary/torio_io_class.rst b/docs/source/_templates/autosummary/torio_io_class.rst deleted file mode 100644 index f83820ca6d..0000000000 --- a/docs/source/_templates/autosummary/torio_io_class.rst +++ /dev/null @@ -1,90 +0,0 @@ -.. - autogenerated from source/_templates/autosummary/torio_io_class.rst - -{#- - ################################################################################ - # autosummary template for torio.io module - # Since StreamingMediaDecoder/StreamingMediaEncoder have many methods/properties, - # we want to list them up in the table of contents. - # The default class template does not do this, so we use custom one here. 
- ################################################################################ -#} - -{{ name | underline }} - -.. autoclass:: {{ fullname }} - -{%- if attributes %} - -Properties ----------- - -{%- for item in attributes %} -{%- if not item.startswith('_') and item not in inherited_members %} - -{{ item | underline("~") }} - -.. container:: py attribute - - .. autoproperty:: {{[fullname, item] | join('.')}} - -{%- endif %} -{%- endfor %} -{%- endif %} - -{%- if members %} - -Methods -------- - -{%- for item in members %} -{%- if - not item.startswith('_') - and item not in inherited_members - and item not in attributes - %} - -{{ item | underline("~") }} - -.. container:: py attribute - - .. automethod:: {{[fullname, item] | join('.')}} - -{%- endif %} -{%- endfor %} -{%- endif %} - - -{%- if name in ["StreamingMediaDecoder", "StreamingMediaEncoder"] %} - -Support Structures ------------------- - -{%- if name == "StreamingMediaDecoder" %} -{%- for item in [ - "ChunkTensor", - "SourceStream", - "SourceAudioStream", - "SourceVideoStream", - "OutputStream", - "OutputAudioStream", - "OutputVideoStream", -] %} - -{{ item | underline("~") }} - -.. autoclass:: torio.io._streaming_media_decoder::{{item}}() - :members: - -{%- endfor %} - -{%- elif name == "StreamingMediaEncoder" %} - -CodecConfig -~~~~~~~~~~~ - -.. autoclass:: torio.io::CodecConfig - :members: - -{%- endif %} -{%- endif %} diff --git a/docs/source/libtorio.rst b/docs/source/libtorio.rst deleted file mode 100644 index d96296e21c..0000000000 --- a/docs/source/libtorio.rst +++ /dev/null @@ -1,17 +0,0 @@ -libtorio -======== - - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - ``torio`` is deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. 
- - Please see https://github.com/pytorch/audio/issues/3902 for more information. - -.. toctree:: - libtorio.stream_reader - libtorio.stream_writer diff --git a/docs/source/libtorio.stream_reader.rst b/docs/source/libtorio.stream_reader.rst deleted file mode 100644 index e59419a801..0000000000 --- a/docs/source/libtorio.stream_reader.rst +++ /dev/null @@ -1,155 +0,0 @@ - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - ``torio`` is deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. - - -.. note:: - The top-level namespace has been changed from ``torchaudio`` to ``torio``. - ``StreamReader`` has been renamed to ``StreamingMediaDecoder``. - - -torio::io::StreamingMediaDecoder -================================ - -``StreamingMediaDecoder`` is the implementation used by Python equivalent and provides similar interface. -When working with custom I/O, such as in-memory data, ``StreamingMediaDecoderCustomIO`` class can be used. - -Both classes have the same methods defined, so their usages are the same. - -Constructors ------------- - -StreamingMediaDecoder -^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenclass:: torio::io::StreamingMediaDecoder - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::StreamingMediaDecoder(const std::string &src, const std::optional &format = {}, const c10::optional &option = {}) - -StreamingMediaDecoderCustomIO -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. doxygenclass:: torio::io::StreamingMediaDecoderCustomIO - -.. doxygenfunction:: torio::io::StreamingMediaDecoderCustomIO::StreamingMediaDecoderCustomIO - -Query Methods -------------- - -find_best_audio_stream -^^^^^^^^^^^^^^^^^^^^^^ -.. 
doxygenfunction:: torio::io::StreamingMediaDecoder::find_best_audio_stream - -find_best_video_stream -^^^^^^^^^^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::find_best_video_stream - -get_metadata -^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::get_metadata - -num_src_streams -^^^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::num_src_streams - -get_src_stream_info -^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::get_src_stream_info - -num_out_streams -^^^^^^^^^^^^^^^ - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::num_out_streams - -get_out_stream_info -^^^^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::get_out_stream_info - -is_buffer_ready -^^^^^^^^^^^^^^^ - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::is_buffer_ready - -Configure Methods ------------------ - -add_audio_stream -^^^^^^^^^^^^^^^^ - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::add_audio_stream - -add_video_stream -^^^^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::add_video_stream - -remove_stream -^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::remove_stream - -Stream Methods -^^^^^^^^^^^^^^ - -seek -^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::seek - -process_packet -^^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::process_packet() - -process_packet_block -^^^^^^^^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::process_packet_block - -process_all_packets -^^^^^^^^^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::process_all_packets - -fill_buffer -^^^^^^^^^^^ -.. doxygenfunction:: torio::io::StreamingMediaDecoder::fill_buffer - -Retrieval Methods ------------------ - -pop_chunks -^^^^^^^^^^ - -.. doxygenfunction:: torio::io::StreamingMediaDecoder::pop_chunks - - -Support Structures ------------------- - -Chunk -^^^^^ - -.. 
container:: py attribute - - .. doxygenstruct:: torio::io::Chunk - :members: - -SrcStreaminfo -^^^^^^^^^^^^^ - -.. container:: py attribute - - .. doxygenstruct:: torio::io::SrcStreamInfo - :members: - -OutputStreaminfo -^^^^^^^^^^^^^^^^ - -.. container:: py attribute - - .. doxygenstruct:: torio::io::OutputStreamInfo - :members: diff --git a/docs/source/torio.io.rst b/docs/source/torio.io.rst deleted file mode 100644 index eb41c71259..0000000000 --- a/docs/source/torio.io.rst +++ /dev/null @@ -1,30 +0,0 @@ -.. py:module:: torio.io - -torio.io -======== - -.. currentmodule:: torio.io - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - ``torio`` is deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/torio_io_class.rst - - StreamingMediaDecoder - StreamingMediaEncoder - -.. rubric:: Tutorials using ``torio.io`` - -.. minigallery:: torio.io - -.. minigallery:: torchaudio.io diff --git a/docs/source/torio.rst b/docs/source/torio.rst deleted file mode 100644 index 1426603e52..0000000000 --- a/docs/source/torio.rst +++ /dev/null @@ -1,26 +0,0 @@ -.. py:module:: torio - -torio -===== - -.. currentmodule:: torio.io - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - ``torio`` is deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. - -``torio`` is an alternative top-level module for I/O features. 
It is the extraction of the core implementation of I/O feature of ``torchaudio``. - -If you want to use the multimedia processing features, but do not want to depend on the entire ``torchaudio`` package, you can use ``torio``. - -.. note:: - - Currently, ``torio`` is distributed alongside ``torchaudio``, and there is no stand-alone - procedure to install ``torio`` only. Please refer to https://pytorch.org/get-started/locally/ - for the installation of ``torchaudio``. diff --git a/docs/source/torio.utils.rst b/docs/source/torio.utils.rst deleted file mode 100644 index a30a1db642..0000000000 --- a/docs/source/torio.utils.rst +++ /dev/null @@ -1,25 +0,0 @@ -.. py:module:: torio.utils - -torio.utils -=========== - -``torio.utils`` module contains utility functions to query and configure the global state of third party libraries. - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - ``torio`` is deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. - -.. currentmodule:: torio.utils - -.. 
autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/utils.rst - - ffmpeg_utils diff --git a/src/libtorio/ffmpeg/CMakeLists.txt b/src/libtorio/ffmpeg/CMakeLists.txt deleted file mode 100644 index a5c9e74b31..0000000000 --- a/src/libtorio/ffmpeg/CMakeLists.txt +++ /dev/null @@ -1,93 +0,0 @@ -set( - sources - ffmpeg.cpp - filter_graph.cpp - hw_context.cpp - stream_reader/buffer/chunked_buffer.cpp - stream_reader/buffer/unchunked_buffer.cpp - stream_reader/conversion.cpp - stream_reader/packet_buffer.cpp - stream_reader/post_process.cpp - stream_reader/stream_processor.cpp - stream_reader/stream_reader.cpp - stream_writer/encode_process.cpp - stream_writer/encoder.cpp - stream_writer/packet_writer.cpp - stream_writer/stream_writer.cpp - stream_writer/tensor_converter.cpp - ) - -set( - ext_sources - pybind/pybind.cpp - ) - -if (USE_CUDA) - set( - additional_lib - cuda_deps) -endif() - -if (TARGET ffmpeg) - torio_library( - libtorio_ffmpeg - "${sources}" - "" - "torch;ffmpeg;${additional_lib}" - "" - ) - if (BUILD_TORIO_PYTHON_EXTENSION) - torio_extension( - _torio_ffmpeg - "${ext_sources}" - "" - "libtorio_ffmpeg" - "TORIO_FFMPEG_EXT_NAME=_torio_ffmpeg" - ) - endif() -else() - torio_library( - libtorio_ffmpeg4 - "${sources}" - "" - "torch;ffmpeg4;${additional_lib}" - "" - ) - torio_library( - libtorio_ffmpeg5 - "${sources}" - "" - "torch;ffmpeg5;${additional_lib}" - "" - ) - torio_library( - libtorio_ffmpeg6 - "${sources}" - "" - "torch;ffmpeg6;${additional_lib}" - "" - ) - if (BUILD_TORIO_PYTHON_EXTENSION) - torio_extension( - _torio_ffmpeg4 - "${ext_sources}" - "" - "libtorio_ffmpeg4" - "TORIO_FFMPEG_EXT_NAME=_torio_ffmpeg4" - ) - torio_extension( - _torio_ffmpeg5 - "${ext_sources}" - "" - "libtorio_ffmpeg5" - "TORIO_FFMPEG_EXT_NAME=_torio_ffmpeg5" - ) - torio_extension( - _torio_ffmpeg6 - "${ext_sources}" - "" - "libtorio_ffmpeg6" - "TORIO_FFMPEG_EXT_NAME=_torio_ffmpeg6" - ) - endif () -endif() diff --git a/src/libtorio/ffmpeg/README.md 
b/src/libtorio/ffmpeg/README.md deleted file mode 100644 index cb77e2ef3b..0000000000 --- a/src/libtorio/ffmpeg/README.md +++ /dev/null @@ -1,134 +0,0 @@ -# FFMpeg binding dev note - -The ffmpeg binding is based on ver 4.1. - -## Learning material - -For understanding the concept of stream processing, some tutorials are useful. - -https://github.com/leandromoreira/ffmpeg-libav-tutorial - -The best way to learn how to use ffmpeg is to look at the official examples. -Practically all the code is re-organization of examples; - -https://ffmpeg.org/doxygen/4.1/examples.html - -## StreamingMediaDecoder Architecture - -The top level class is `StreamingMediaDecoder` class. This class handles the input (via `AVFormatContext*`), and manages `StreamProcessor`s for each stream in the input. - -The `StreamingMediaDecoder` object slices the input data into a series of `AVPacket` objects and it feeds the objects to corresponding `StreamProcessor`s. - -``` - StreamingMediaDecoder -┌─────────────────────────────────────────────────┐ -│ │ -│ AVFormatContext* ┌──► StreamProcessor[0] │ -│ │ │ │ -│ └─────────────┼──► StreamProcessor[1] │ -│ AVPacket* │ │ -│ └──► ... │ -│ │ -└─────────────────────────────────────────────────┘ -``` - -The `StreamProcessor` class is composed of one `Decoder` and multiple of `Sink` objects. - -`Sink` objects correspond to output streams that users set. -`Sink` class is a wrapper `FilterGraph` and `Buffer` classes. - -The `AVPacket*` passed to `StreamProcessor` is first passed to `Decoder`. -`Decoder` generates audio / video frames (`AVFrame`) and pass it to `Sink`s. - -Firstly `Sink` class passes the incoming frame to `FilterGraph`. - -`FilterGraph` is a class based on [`AVFilterGraph` structure](https://ffmpeg.org/doxygen/4.1/structAVFilterGraph.html), -and it can apply various filters. -At minimum, it performs format conversion so that the resuling data is suitable for Tensor representation, -such as YUV to RGB. 
- -The output `AVFrame` from `FilterGraph` is passed to `Buffer` class, which converts it to Tensor. - -``` - StreamProcessor -┌─────────────────────────────────────────────────────────┐ -│ AVPacket* │ -│ │ │ -│ │ AVFrame* AVFrame* │ -│ └► Decoder ──┬─► FilterGraph ─────► Buffer ───► Tensor │ -│ │ │ -│ ├─► FilterGraph ─────► Buffer ───► Tensor │ -│ │ │ -│ └─► ... │ -│ │ -└─────────────────────────────────────────────────────────┘ -``` - -## Implementation guideline - -### Memory management and object lifecycle - -Ffmpeg uses raw pointers, which needs to be allocated and freed with dedicated functions. -In the binding code, these pointers are encapsulated in a class with RAII semantic and -`std::unique_ptr<>` to guarantee sole ownership. - -**Decoder lifecycle** - -```c++ -// Default construction (no memory allocation) -decoder = Decoder(...); -// Decode -decoder.process_packet(pPacket); -// Retrieve result -decoder.get_frame(pFrame); -// Release resources -decoder::~Decoder(); -``` - -**FilterGraph lifecycle** - -```c++ -// Default construction (no memory allocation) -filter_graph = FilterGraph(AVMEDIA_TYPE_AUDIO); -// Filter configuration -filter_fraph.add_audio_src(..) -filter_fraph.add_sink(..) 
-filter_fraph.add_process("") -filter_graph.create_filter(); -// Apply filter -fitler_graph.add_frame(pFrame); -// Retrieve result -filter_graph.get_frame(pFrame); -// Release resources -filter_graph::~FilterGraph(); -``` - -**StreamProcessor lifecycle** - -```c++ -// Default construction (no memory allocation) -processor = Processor(...); -// Define the process stream -processor.add_audio_stream(...); -processor.add_audio_stream(...); -// Process the packet -processor.process_packet(pPacket); -// Retrieve result -tensor = processor.get_chunk(...); -// Release resources -processor::~Processor(); -``` - -### ON/OFF semantic and `std::unique_ptr<>` - -Since we want to make some components (such as stream processors and filters) -separately configurable, we introduce states for ON/OFF. -To make the code simple, we use `std::unique_ptr<>`. -`nullptr` means the component is turned off. -This pattern applies to `StreamProcessor` (output streams). - -### Exception and return value - -To report the error during the configuration and initialization of objects, -we use `Exception`. However, throwing errors is expensive during the streaming, -so we use return value for that. 
diff --git a/src/libtorio/ffmpeg/ffmpeg.cpp b/src/libtorio/ffmpeg/ffmpeg.cpp deleted file mode 100644 index a7e2974876..0000000000 --- a/src/libtorio/ffmpeg/ffmpeg.cpp +++ /dev/null @@ -1,148 +0,0 @@ -#include -#include -#include -#include -#include - -namespace torio::io { - -//////////////////////////////////////////////////////////////////////////////// -// AVDictionary -//////////////////////////////////////////////////////////////////////////////// -AVDictionary* get_option_dict(const std::optional& option) { - AVDictionary* opt = nullptr; - if (option) { - for (auto const& [key, value] : option.value()) { - av_dict_set(&opt, key.c_str(), value.c_str(), 0); - } - } - return opt; -} - -void clean_up_dict(AVDictionary* p) { - if (p) { - std::vector unused_keys; - // Check and copy unused keys, clean up the original dictionary - AVDictionaryEntry* t = nullptr; - while ((t = av_dict_get(p, "", t, AV_DICT_IGNORE_SUFFIX))) { - unused_keys.emplace_back(t->key); - } - av_dict_free(&p); - TORCH_CHECK( - unused_keys.empty(), - "Unexpected options: ", - c10::Join(", ", unused_keys)); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// AVFormatContext -//////////////////////////////////////////////////////////////////////////////// -void AVFormatInputContextDeleter::operator()(AVFormatContext* p) { - avformat_close_input(&p); -}; - -AVFormatInputContextPtr::AVFormatInputContextPtr(AVFormatContext* p) - : Wrapper(p) {} - -void AVFormatOutputContextDeleter::operator()(AVFormatContext* p) { - avformat_free_context(p); -}; - -AVFormatOutputContextPtr::AVFormatOutputContextPtr(AVFormatContext* p) - : Wrapper(p) {} - -//////////////////////////////////////////////////////////////////////////////// -// AVIO -//////////////////////////////////////////////////////////////////////////////// -void AVIOContextDeleter::operator()(AVIOContext* p) { - avio_flush(p); - av_freep(&p->buffer); - av_freep(&p); -}; - 
-AVIOContextPtr::AVIOContextPtr(AVIOContext* p) - : Wrapper(p) {} - -//////////////////////////////////////////////////////////////////////////////// -// AVPacket -//////////////////////////////////////////////////////////////////////////////// -void AVPacketDeleter::operator()(AVPacket* p) { - av_packet_free(&p); -}; - -AVPacketPtr::AVPacketPtr(AVPacket* p) : Wrapper(p) {} - -AVPacketPtr alloc_avpacket() { - AVPacket* p = av_packet_alloc(); - TORCH_CHECK(p, "Failed to allocate AVPacket object."); - return AVPacketPtr{p}; -} - -//////////////////////////////////////////////////////////////////////////////// -// AVPacket - buffer unref -//////////////////////////////////////////////////////////////////////////////// -AutoPacketUnref::AutoPacketUnref(AVPacketPtr& p) : p_(p){}; -AutoPacketUnref::~AutoPacketUnref() { - av_packet_unref(p_); -} -AutoPacketUnref::operator AVPacket*() const { - return p_; -} - -//////////////////////////////////////////////////////////////////////////////// -// AVFrame -//////////////////////////////////////////////////////////////////////////////// -void AVFrameDeleter::operator()(AVFrame* p) { - av_frame_free(&p); -}; - -AVFramePtr::AVFramePtr(AVFrame* p) : Wrapper(p) {} - -AVFramePtr alloc_avframe() { - AVFrame* p = av_frame_alloc(); - TORCH_CHECK(p, "Failed to allocate AVFrame object."); - return AVFramePtr{p}; -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVCodecContext -//////////////////////////////////////////////////////////////////////////////// -void AVCodecContextDeleter::operator()(AVCodecContext* p) { - avcodec_free_context(&p); -}; - -AVCodecContextPtr::AVCodecContextPtr(AVCodecContext* p) - : Wrapper(p) {} - -//////////////////////////////////////////////////////////////////////////////// -// AVBufferRefPtr -//////////////////////////////////////////////////////////////////////////////// -void AutoBufferUnref::operator()(AVBufferRef* p) { - av_buffer_unref(&p); -} - 
-AVBufferRefPtr::AVBufferRefPtr(AVBufferRef* p) - : Wrapper(p) {} - -//////////////////////////////////////////////////////////////////////////////// -// AVFilterGraph -//////////////////////////////////////////////////////////////////////////////// -void AVFilterGraphDeleter::operator()(AVFilterGraph* p) { - avfilter_graph_free(&p); -}; - -AVFilterGraphPtr::AVFilterGraphPtr(AVFilterGraph* p) - : Wrapper(p) {} - -//////////////////////////////////////////////////////////////////////////////// -// AVCodecParameters -//////////////////////////////////////////////////////////////////////////////// -void AVCodecParametersDeleter::operator()(AVCodecParameters* codecpar) { - avcodec_parameters_free(&codecpar); -} - -AVCodecParametersPtr::AVCodecParametersPtr(AVCodecParameters* p) - : Wrapper(p) {} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/ffmpeg.h b/src/libtorio/ffmpeg/ffmpeg.h deleted file mode 100644 index 0a680a7d7d..0000000000 --- a/src/libtorio/ffmpeg/ffmpeg.h +++ /dev/null @@ -1,214 +0,0 @@ -// One stop header for all ffmepg needs -#pragma once -#include -#include -#include -#include -#include - -extern "C" { -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -} - -/// @cond - -namespace torio { -namespace io { - -using OptionDict = std::map; - -// https://github.com/FFmpeg/FFmpeg/blob/4e6debe1df7d53f3f59b37449b82265d5c08a172/doc/APIchanges#L252-L260 -// Starting from libavformat 59 (ffmpeg 5), -// AVInputFormat is const and related functions expect constant. 
-#if LIBAVFORMAT_VERSION_MAJOR >= 59 -#define AVFORMAT_CONST const -#else -#define AVFORMAT_CONST -#endif - -// Replacement of av_err2str, which causes -// `error: taking address of temporary array` -// https://github.com/joncampbell123/composite-video-simulator/issues/5 -av_always_inline std::string av_err2string(int errnum) { - char str[AV_ERROR_MAX_STRING_SIZE]; - return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum); -} - -// Base structure that handles memory management. -// Resource is freed by the destructor of unique_ptr, -// which will call custom delete mechanism provided via Deleter -// https://stackoverflow.com/a/19054280 -// -// The resource allocation will be provided by custom constructors. -template -class Wrapper { - std::unique_ptr ptr; - - public: - Wrapper() = delete; - explicit Wrapper(T* t) : ptr(t) {} - T* operator->() const { - return ptr.get(); - } - explicit operator bool() const { - return (bool)ptr; - } - operator T*() const { - return ptr.get(); - } -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVDictionary -//////////////////////////////////////////////////////////////////////////////// -// Since AVDictionaries are relocated by FFmpeg APIs it does not suit to -// IIRC-semantic. Instead we provide helper functions. - -// Convert standard dict to FFmpeg native type -AVDictionary* get_option_dict(const std::optional& option); - -// Clean up the dict after use. 
If there is an unsed key, throw runtime error -void clean_up_dict(AVDictionary* p); - -//////////////////////////////////////////////////////////////////////////////// -// AVFormatContext -//////////////////////////////////////////////////////////////////////////////// -struct AVFormatInputContextDeleter { - void operator()(AVFormatContext* p); -}; - -struct AVFormatInputContextPtr - : public Wrapper { - explicit AVFormatInputContextPtr(AVFormatContext* p); -}; - -struct AVFormatOutputContextDeleter { - void operator()(AVFormatContext* p); -}; - -struct AVFormatOutputContextPtr - : public Wrapper { - explicit AVFormatOutputContextPtr(AVFormatContext* p); -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVIO -//////////////////////////////////////////////////////////////////////////////// -struct AVIOContextDeleter { - void operator()(AVIOContext* p); -}; - -struct AVIOContextPtr : public Wrapper { - explicit AVIOContextPtr(AVIOContext* p); -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVPacket -//////////////////////////////////////////////////////////////////////////////// -struct AVPacketDeleter { - void operator()(AVPacket* p); -}; - -struct AVPacketPtr : public Wrapper { - explicit AVPacketPtr(AVPacket* p); -}; - -AVPacketPtr alloc_avpacket(); - -//////////////////////////////////////////////////////////////////////////////// -// AVPacket - buffer unref -//////////////////////////////////////////////////////////////////////////////// -// AVPacket structure employs two-staged memory allocation. -// The first-stage is for allocating AVPacket object itself, and it typically -// happens only once throughout the lifetime of application. -// The second-stage is for allocating the content (media data) each time the -// input file is processed and a chunk of data is read. The memory allocated -// during this time has to be released before the next iteration. 
-// The first-stage memory management is handled by `AVPacketPtr`. -// `AutoPacketUnref` handles the second-stage memory management. -struct AutoPacketUnref { - AVPacketPtr& p_; - explicit AutoPacketUnref(AVPacketPtr& p); - ~AutoPacketUnref(); - operator AVPacket*() const; -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVFrame -//////////////////////////////////////////////////////////////////////////////// -struct AVFrameDeleter { - void operator()(AVFrame* p); -}; - -struct AVFramePtr : public Wrapper { - explicit AVFramePtr(AVFrame* p); -}; - -AVFramePtr alloc_avframe(); - -//////////////////////////////////////////////////////////////////////////////// -// AutoBufferUnrer is responsible for performing unref at the end of lifetime -// of AVBufferRefPtr. -//////////////////////////////////////////////////////////////////////////////// -struct AutoBufferUnref { - void operator()(AVBufferRef* p); -}; - -struct AVBufferRefPtr : public Wrapper { - explicit AVBufferRefPtr(AVBufferRef* p); -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVCodecContext -//////////////////////////////////////////////////////////////////////////////// -struct AVCodecContextDeleter { - void operator()(AVCodecContext* p); -}; -struct AVCodecContextPtr - : public Wrapper { - explicit AVCodecContextPtr(AVCodecContext* p); -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVFilterGraph -//////////////////////////////////////////////////////////////////////////////// -struct AVFilterGraphDeleter { - void operator()(AVFilterGraph* p); -}; -struct AVFilterGraphPtr : public Wrapper { - explicit AVFilterGraphPtr(AVFilterGraph* p); -}; - -//////////////////////////////////////////////////////////////////////////////// -// AVCodecParameters -//////////////////////////////////////////////////////////////////////////////// -struct AVCodecParametersDeleter { - void 
operator()(AVCodecParameters* p); -}; - -struct AVCodecParametersPtr - : public Wrapper { - explicit AVCodecParametersPtr(AVCodecParameters* p); -}; - -struct StreamParams { - AVCodecParametersPtr codec_params{nullptr}; - AVRational time_base{}; - int stream_index{}; -}; -} // namespace io -} // namespace torio - -/// @endcond diff --git a/src/libtorio/ffmpeg/filter_graph.cpp b/src/libtorio/ffmpeg/filter_graph.cpp deleted file mode 100644 index 350ccabdbe..0000000000 --- a/src/libtorio/ffmpeg/filter_graph.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include - -namespace torio::io { - -namespace { -AVFilterGraph* get_filter_graph() { - AVFilterGraph* ptr = avfilter_graph_alloc(); - TORCH_CHECK(ptr, "Failed to allocate resouce."); - ptr->nb_threads = 1; - return ptr; -} -} // namespace - -FilterGraph::FilterGraph() : graph(get_filter_graph()) {} - -//////////////////////////////////////////////////////////////////////////////// -// Configuration methods -//////////////////////////////////////////////////////////////////////////////// -namespace { -std::string get_audio_src_args( - AVSampleFormat format, - AVRational time_base, - int sample_rate, - uint64_t channel_layout) { - char args[512]; - std::snprintf( - args, - sizeof(args), - "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64, - time_base.num, - time_base.den, - sample_rate, - av_get_sample_fmt_name(format), - channel_layout); - return std::string(args); -} - -std::string get_video_src_args( - AVPixelFormat format, - AVRational time_base, - AVRational frame_rate, - int width, - int height, - AVRational sample_aspect_ratio) { - char args[512]; - std::snprintf( - args, - sizeof(args), - "video_size=%dx%d:pix_fmt=%s:time_base=%d/%d:frame_rate=%d/%d:pixel_aspect=%d/%d", - width, - height, - av_get_pix_fmt_name(format), - time_base.num, - time_base.den, - frame_rate.num, - frame_rate.den, - sample_aspect_ratio.num, - sample_aspect_ratio.den); - return std::string(args); -} - -} // namespace - -void 
FilterGraph::add_audio_src( - AVSampleFormat format, - AVRational time_base, - int sample_rate, - uint64_t channel_layout) { - add_src( - avfilter_get_by_name("abuffer"), - get_audio_src_args(format, time_base, sample_rate, channel_layout)); -} - -void FilterGraph::add_video_src( - AVPixelFormat format, - AVRational time_base, - AVRational frame_rate, - int width, - int height, - AVRational sample_aspect_ratio) { - add_src( - avfilter_get_by_name("buffer"), - get_video_src_args( - format, time_base, frame_rate, width, height, sample_aspect_ratio)); -} - -void FilterGraph::add_src(const AVFilter* buffersrc, const std::string& args) { - int ret = avfilter_graph_create_filter( - &buffersrc_ctx, buffersrc, "in", args.c_str(), nullptr, graph); - TORCH_CHECK( - ret >= 0, - "Failed to create input filter: \"" + args + "\" (" + av_err2string(ret) + - ")"); -} - -void FilterGraph::add_audio_sink() { - add_sink(avfilter_get_by_name("abuffersink")); -} - -void FilterGraph::add_video_sink() { - add_sink(avfilter_get_by_name("buffersink")); -} - -void FilterGraph::add_sink(const AVFilter* buffersink) { - TORCH_CHECK(!buffersink_ctx, "Sink buffer is already allocated."); - // Note - // Originally, the code here followed the example - // https://ffmpeg.org/doxygen/4.1/filtering_audio_8c-example.html - // which sets option for `abuffersink`, which caused an issue where the - // `abuffersink` parameters set for the first time survive across multiple - // fitler generations. - // According to the other example - // https://ffmpeg.org/doxygen/4.1/filter_audio_8c-example.html - // `abuffersink` should not take options, and this resolved issue. - int ret = avfilter_graph_create_filter( - &buffersink_ctx, buffersink, "out", nullptr, nullptr, graph); - TORCH_CHECK(ret >= 0, "Failed to create output filter."); -} - -namespace { - -// Encapsulating AVFilterInOut* with handy methods since -// we need to deal with multiple of them at the same time. 
-class InOuts { - AVFilterInOut* p = nullptr; - // Disable copy constructor/assignment just in case. - InOuts(const InOuts&) = delete; - InOuts& operator=(const InOuts&) = delete; - - public: - InOuts(const char* name, AVFilterContext* pCtx) { - p = avfilter_inout_alloc(); - TORCH_CHECK(p, "Failed to allocate AVFilterInOut."); - p->name = av_strdup(name); - p->filter_ctx = pCtx; - p->pad_idx = 0; - p->next = nullptr; - } - ~InOuts() { - avfilter_inout_free(&p); - } - operator AVFilterInOut**() { - return &p; - } -}; - -} // namespace - -void FilterGraph::add_process(const std::string& filter_description) { - // Note - // The official example and other derived codes out there use - // https://ffmpeg.org/doxygen/4.1/filtering_audio_8c-example.html#_a37 - // variable name `in` for "out"/buffersink, and `out` for "in"/buffersrc. - // If you are debugging this part of the code, you might get confused. - InOuts in{"in", buffersrc_ctx}, out{"out", buffersink_ctx}; - - int ret = avfilter_graph_parse_ptr( - graph, filter_description.c_str(), out, in, nullptr); - - TORCH_CHECK( - ret >= 0, - "Failed to create the filter from \"" + filter_description + "\" (" + - av_err2string(ret) + ".)"); -} - -void FilterGraph::create_filter(AVBufferRef* hw_frames_ctx) { - buffersrc_ctx->outputs[0]->hw_frames_ctx = hw_frames_ctx; - int ret = avfilter_graph_config(graph, nullptr); - TORCH_CHECK(ret >= 0, "Failed to configure the graph: " + av_err2string(ret)); - // char* desc = avfilter_graph_dump(graph, NULL); - // std::cerr << "Filter created:\n" << desc << std::endl; - // av_free(static_cast(desc)); -} - -////////////////////////////////////////////////////////////////////////////// -// Query methods -////////////////////////////////////////////////////////////////////////////// -FilterGraphOutputInfo FilterGraph::get_output_info() const { - TORCH_INTERNAL_ASSERT(buffersink_ctx, "FilterGraph is not initialized."); - AVFilterLink* l = buffersink_ctx->inputs[0]; - FilterGraphOutputInfo 
ret{}; - ret.type = l->type; - ret.format = l->format; - ret.time_base = l->time_base; - switch (l->type) { - case AVMEDIA_TYPE_AUDIO: { - ret.sample_rate = l->sample_rate; -#if LIBAVFILTER_VERSION_MAJOR >= 8 && LIBAVFILTER_VERSION_MINOR >= 44 - ret.num_channels = l->ch_layout.nb_channels; -#else - // Before FFmpeg 5.1 - ret.num_channels = av_get_channel_layout_nb_channels(l->channel_layout); -#endif - break; - } - case AVMEDIA_TYPE_VIDEO: { - // If this is CUDA, retrieve the software pixel format from HW frames - // context. - if (l->format == AV_PIX_FMT_CUDA) { - // Originally, we were expecting that filter graph would propagate the - // HW frames context, so that we can retrieve it from the sink link. - // However, this is sometimes not the case. - // We do not know what is causing this behavior (GPU? libavfilter? - // format?) we resort to the source link in such case. - // - // (Technically, filters like scale_cuda could change the pixel format. - // We expect that hw_frames_ctx is propagated in such cases, but we do - // not know. - // TODO: check how scale_cuda interferes. 
- auto frames_ctx = [&]() -> AVHWFramesContext* { - if (l->hw_frames_ctx) { - return (AVHWFramesContext*)(l->hw_frames_ctx->data); - } - return (AVHWFramesContext*)(buffersrc_ctx->outputs[0] - ->hw_frames_ctx->data); - }(); - ret.format = frames_ctx->sw_format; - } - ret.frame_rate = l->frame_rate; - ret.height = l->h; - ret.width = l->w; - break; - } - default:; - } - return ret; -} - -//////////////////////////////////////////////////////////////////////////////// -// Streaming process -////////////////////////////////////////////////////////////////////////////// -int FilterGraph::add_frame(AVFrame* pInputFrame) { - return av_buffersrc_add_frame_flags( - buffersrc_ctx, pInputFrame, AV_BUFFERSRC_FLAG_KEEP_REF); -} - -int FilterGraph::get_frame(AVFrame* pOutputFrame) { - return av_buffersink_get_frame(buffersink_ctx, pOutputFrame); -} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/filter_graph.h b/src/libtorio/ffmpeg/filter_graph.h deleted file mode 100644 index 2495c2d240..0000000000 --- a/src/libtorio/ffmpeg/filter_graph.h +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once - -#include -namespace torio { -namespace io { - -/// Used to report the output formats of filter graph. -struct FilterGraphOutputInfo { - AVMediaType type = AVMEDIA_TYPE_UNKNOWN; - int format = -1; - - AVRational time_base = {1, 1}; - - // Audio - int sample_rate = -1; - int num_channels = -1; - - // Video - AVRational frame_rate = {0, 1}; - int height = -1; - int width = -1; -}; - -class FilterGraph { - AVFilterGraphPtr graph; - - // AVFilterContext is freed as a part of AVFilterGraph - // so we do not manage the resource. 
- AVFilterContext* buffersrc_ctx = nullptr; - AVFilterContext* buffersink_ctx = nullptr; - - public: - explicit FilterGraph(); - // Custom destructor to release AVFilterGraph* - ~FilterGraph() = default; - // Non-copyable - FilterGraph(const FilterGraph&) = delete; - FilterGraph& operator=(const FilterGraph&) = delete; - // Movable - FilterGraph(FilterGraph&&) = default; - FilterGraph& operator=(FilterGraph&&) = default; - - ////////////////////////////////////////////////////////////////////////////// - // Configuration methods - ////////////////////////////////////////////////////////////////////////////// - void add_audio_src( - AVSampleFormat format, - AVRational time_base, - int sample_rate, - uint64_t channel_layout); - - void add_video_src( - AVPixelFormat format, - AVRational time_base, - AVRational frame_rate, - int width, - int height, - AVRational sample_aspect_ratio); - - void add_audio_sink(); - - void add_video_sink(); - - void add_process(const std::string& filter_description); - - void create_filter(AVBufferRef* hw_frames_ctx = nullptr); - - private: - void add_src(const AVFilter* buffersrc, const std::string& arg); - - void add_sink(const AVFilter* buffersrc); - - ////////////////////////////////////////////////////////////////////////////// - // Query methods - ////////////////////////////////////////////////////////////////////////////// - public: - [[nodiscard]] FilterGraphOutputInfo get_output_info() const; - - ////////////////////////////////////////////////////////////////////////////// - // Streaming process - ////////////////////////////////////////////////////////////////////////////// - public: - int add_frame(AVFrame* pInputFrame); - int get_frame(AVFrame* pOutputFrame); -}; - -} // namespace io -} // namespace torio diff --git a/src/libtorio/ffmpeg/hw_context.cpp b/src/libtorio/ffmpeg/hw_context.cpp deleted file mode 100644 index 2bca656507..0000000000 --- a/src/libtorio/ffmpeg/hw_context.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include - 
-namespace torio::io { -namespace { - -static std::mutex MUTEX; -static std::map CUDA_CONTEXT_CACHE; - -} // namespace - -AVBufferRef* get_cuda_context(int index) { - std::lock_guard lock(MUTEX); - if (index == -1) { - index = 0; - } - if (CUDA_CONTEXT_CACHE.count(index) == 0) { - AVBufferRef* p = nullptr; - int ret = av_hwdevice_ctx_create( - &p, AV_HWDEVICE_TYPE_CUDA, std::to_string(index).c_str(), nullptr, 0); - TORCH_CHECK( - ret >= 0, - "Failed to create CUDA device context on device ", - index, - "(", - av_err2string(ret), - ")"); - assert(p); - CUDA_CONTEXT_CACHE.emplace(index, p); - return p; - } - AVBufferRefPtr& buffer = CUDA_CONTEXT_CACHE.at(index); - return buffer; -} - -void clear_cuda_context_cache() { - std::lock_guard lock(MUTEX); - CUDA_CONTEXT_CACHE.clear(); -} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/hw_context.h b/src/libtorio/ffmpeg/hw_context.h deleted file mode 100644 index cc58b651b0..0000000000 --- a/src/libtorio/ffmpeg/hw_context.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include - -namespace torio::io { - -AVBufferRef* get_cuda_context(int index); - -void clear_cuda_context_cache(); - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/pybind/pybind.cpp b/src/libtorio/ffmpeg/pybind/pybind.cpp deleted file mode 100644 index 3f954a2afc..0000000000 --- a/src/libtorio/ffmpeg/pybind/pybind.cpp +++ /dev/null @@ -1,469 +0,0 @@ -#include -#include -#include -#include - -namespace torio::io { -namespace { - -std::map> get_versions() { - std::map> ret; - -#define add_version(NAME) \ - { \ - int ver = NAME##_version(); \ - ret.emplace( \ - "lib" #NAME, \ - std::make_tuple<>( \ - AV_VERSION_MAJOR(ver), \ - AV_VERSION_MINOR(ver), \ - AV_VERSION_MICRO(ver))); \ - } - - add_version(avutil); - add_version(avcodec); - add_version(avformat); - add_version(avfilter); - add_version(avdevice); - return ret; - -#undef add_version -} - -std::map get_demuxers(bool req_device) { - std::map ret; - const AVInputFormat* fmt = 
nullptr; - void* i = nullptr; - while ((fmt = av_demuxer_iterate(&i))) { - assert(fmt); - bool is_device = [&]() { - const AVClass* avclass = fmt->priv_class; - return avclass && AV_IS_INPUT_DEVICE(avclass->category); - }(); - if (req_device == is_device) { - ret.emplace(fmt->name, fmt->long_name); - } - } - return ret; -} - -std::map get_muxers(bool req_device) { - std::map ret; - const AVOutputFormat* fmt = nullptr; - void* i = nullptr; - while ((fmt = av_muxer_iterate(&i))) { - assert(fmt); - bool is_device = [&]() { - const AVClass* avclass = fmt->priv_class; - return avclass && AV_IS_OUTPUT_DEVICE(avclass->category); - }(); - if (req_device == is_device) { - ret.emplace(fmt->name, fmt->long_name); - } - } - return ret; -} - -std::map get_codecs( - AVMediaType type, - bool req_encoder) { - const AVCodec* c = nullptr; - void* i = nullptr; - std::map ret; - while ((c = av_codec_iterate(&i))) { - assert(c); - if ((req_encoder && av_codec_is_encoder(c)) || - (!req_encoder && av_codec_is_decoder(c))) { - if (c->type == type && c->name) { - ret.emplace(c->name, c->long_name ? 
c->long_name : ""); - } - } - } - return ret; -} - -std::vector get_protocols(bool output) { - void* opaque = nullptr; - const char* name = nullptr; - std::vector ret; - while ((name = avio_enum_protocols(&opaque, output))) { - assert(name); - ret.emplace_back(name); - } - return ret; -} - -std::string get_build_config() { - return avcodec_configuration(); -} - -////////////////////////////////////////////////////////////////////////////// -// StreamingMediaDecoder/Encoder FileObj -////////////////////////////////////////////////////////////////////////////// - -struct FileObj { - py::object fileobj; - int buffer_size; -}; - -namespace { - -static int read_func(void* opaque, uint8_t* buf, int buf_size) { - FileObj* fileobj = static_cast(opaque); - buf_size = FFMIN(buf_size, fileobj->buffer_size); - - int num_read = 0; - while (num_read < buf_size) { - int request = buf_size - num_read; - auto chunk = static_cast( - static_cast(fileobj->fileobj.attr("read")(request))); - auto chunk_len = chunk.length(); - if (chunk_len == 0) { - break; - } - TORCH_CHECK( - chunk_len <= request, - "Requested up to ", - request, - " bytes but, received ", - chunk_len, - " bytes. The given object does not confirm to read protocol of file object."); - memcpy(buf, chunk.data(), chunk_len); - buf += chunk_len; - num_read += static_cast(chunk_len); - } - return num_read == 0 ? AVERROR_EOF : num_read; -} - -static int write_func(void* opaque, uint8_t* buf, int buf_size) { - FileObj* fileobj = static_cast(opaque); - buf_size = FFMIN(buf_size, fileobj->buffer_size); - - py::bytes b(reinterpret_cast(buf), buf_size); - // TODO: check the return value - fileobj->fileobj.attr("write")(b); - return buf_size; -} - -static int64_t seek_func(void* opaque, int64_t offset, int whence) { - // We do not know the file size. 
- if (whence == AVSEEK_SIZE) { - return AVERROR(EIO); - } - FileObj* fileobj = static_cast(opaque); - return py::cast(fileobj->fileobj.attr("seek")(offset, whence)); -} - -} // namespace - -struct StreamingMediaDecoderFileObj : private FileObj, - public StreamingMediaDecoderCustomIO { - StreamingMediaDecoderFileObj( - py::object fileobj, - const std::optional& format, - const std::optional>& option, - int buffer_size) - : FileObj{fileobj, buffer_size}, - StreamingMediaDecoderCustomIO( - this, - format, - buffer_size, - read_func, - py::hasattr(fileobj, "seek") ? &seek_func : nullptr, - option) {} -}; - -struct StreamingMediaEncoderFileObj : private FileObj, - public StreamingMediaEncoderCustomIO { - StreamingMediaEncoderFileObj( - py::object fileobj, - const std::optional& format, - int buffer_size) - : FileObj{fileobj, buffer_size}, - StreamingMediaEncoderCustomIO( - this, - format, - buffer_size, - write_func, - py::hasattr(fileobj, "seek") ? &seek_func : nullptr) {} -}; - -////////////////////////////////////////////////////////////////////////////// -// StreamingMediaDecoder/Encoder Bytes -////////////////////////////////////////////////////////////////////////////// -struct BytesWrapper { - std::string_view src; - size_t index = 0; -}; - -static int read_bytes(void* opaque, uint8_t* buf, int buf_size) { - BytesWrapper* wrapper = static_cast(opaque); - - auto num_read = FFMIN(wrapper->src.size() - wrapper->index, buf_size); - if (num_read == 0) { - return AVERROR_EOF; - } - auto head = wrapper->src.data() + wrapper->index; - memcpy(buf, head, num_read); - wrapper->index += num_read; - return num_read; -} - -static int64_t seek_bytes(void* opaque, int64_t offset, int whence) { - BytesWrapper* wrapper = static_cast(opaque); - if (whence == AVSEEK_SIZE) { - return wrapper->src.size(); - } - - if (whence == SEEK_SET) { - wrapper->index = offset; - } else if (whence == SEEK_CUR) { - wrapper->index += offset; - } else if (whence == SEEK_END) { - wrapper->index = 
wrapper->src.size() + offset; - } else { - TORCH_INTERNAL_ASSERT(false, "Unexpected whence value: ", whence); - } - return static_cast(wrapper->index); -} - -struct StreamingMediaDecoderBytes : private BytesWrapper, - public StreamingMediaDecoderCustomIO { - StreamingMediaDecoderBytes( - std::string_view src, - const std::optional& format, - const std::optional>& option, - int64_t buffer_size) - : BytesWrapper{src}, - StreamingMediaDecoderCustomIO( - this, - format, - buffer_size, - read_bytes, - seek_bytes, - option) {} -}; - -#ifndef TORIO_FFMPEG_EXT_NAME -#error TORIO_FFMPEG_EXT_NAME must be defined. -#endif - -PYBIND11_MODULE(TORIO_FFMPEG_EXT_NAME, m) { - m.def("init", []() { avdevice_register_all(); }); - m.def("get_log_level", []() { return av_log_get_level(); }); - m.def("set_log_level", [](int level) { av_log_set_level(level); }); - m.def("get_versions", &get_versions); - m.def("get_muxers", []() { return get_muxers(false); }); - m.def("get_demuxers", []() { return get_demuxers(false); }); - m.def("get_input_devices", []() { return get_demuxers(true); }); - m.def("get_build_config", &get_build_config); - m.def("get_output_devices", []() { return get_muxers(true); }); - m.def("get_audio_decoders", []() { - return get_codecs(AVMEDIA_TYPE_AUDIO, false); - }); - m.def("get_audio_encoders", []() { - return get_codecs(AVMEDIA_TYPE_AUDIO, true); - }); - m.def("get_video_decoders", []() { - return get_codecs(AVMEDIA_TYPE_VIDEO, false); - }); - m.def("get_video_encoders", []() { - return get_codecs(AVMEDIA_TYPE_VIDEO, true); - }); - m.def("get_input_protocols", []() { return get_protocols(false); }); - m.def("get_output_protocols", []() { return get_protocols(true); }); - m.def("clear_cuda_context_cache", &clear_cuda_context_cache); - - py::class_(m, "Chunk", py::module_local()) - .def_readwrite("frames", &Chunk::frames) - .def_readwrite("pts", &Chunk::pts); - py::class_(m, "CodecConfig", py::module_local()) - .def(py::init&, int, int>()); - py::class_( - m, 
"StreamingMediaEncoder", py::module_local()) - .def(py::init&>()) - .def("set_metadata", &StreamingMediaEncoder::set_metadata) - .def("add_audio_stream", &StreamingMediaEncoder::add_audio_stream) - .def("add_video_stream", &StreamingMediaEncoder::add_video_stream) - .def("dump_format", &StreamingMediaEncoder::dump_format) - .def("open", &StreamingMediaEncoder::open) - .def("write_audio_chunk", &StreamingMediaEncoder::write_audio_chunk) - .def("write_video_chunk", &StreamingMediaEncoder::write_video_chunk) - .def("flush", &StreamingMediaEncoder::flush) - .def("close", &StreamingMediaEncoder::close); - py::class_( - m, "StreamingMediaEncoderFileObj", py::module_local()) - .def(py::init&, int64_t>()) - .def("set_metadata", &StreamingMediaEncoderFileObj::set_metadata) - .def("add_audio_stream", &StreamingMediaEncoderFileObj::add_audio_stream) - .def("add_video_stream", &StreamingMediaEncoderFileObj::add_video_stream) - .def("dump_format", &StreamingMediaEncoderFileObj::dump_format) - .def("open", &StreamingMediaEncoderFileObj::open) - .def( - "write_audio_chunk", &StreamingMediaEncoderFileObj::write_audio_chunk) - .def( - "write_video_chunk", &StreamingMediaEncoderFileObj::write_video_chunk) - .def("flush", &StreamingMediaEncoderFileObj::flush) - .def("close", &StreamingMediaEncoderFileObj::close); - py::class_(m, "OutputStreamInfo", py::module_local()) - .def_readonly("source_index", &OutputStreamInfo::source_index) - .def_readonly("filter_description", &OutputStreamInfo::filter_description) - .def_property_readonly( - "media_type", - [](const OutputStreamInfo& o) -> std::string { - return av_get_media_type_string(o.media_type); - }) - .def_property_readonly( - "format", - [](const OutputStreamInfo& o) -> std::string { - switch (o.media_type) { - case AVMEDIA_TYPE_AUDIO: - return av_get_sample_fmt_name((AVSampleFormat)(o.format)); - case AVMEDIA_TYPE_VIDEO: - return av_get_pix_fmt_name((AVPixelFormat)(o.format)); - default: - TORCH_INTERNAL_ASSERT( - false, - 
"FilterGraph is returning unexpected media type: ", - av_get_media_type_string(o.media_type)); - } - }) - .def_readonly("sample_rate", &OutputStreamInfo::sample_rate) - .def_readonly("num_channels", &OutputStreamInfo::num_channels) - .def_readonly("width", &OutputStreamInfo::width) - .def_readonly("height", &OutputStreamInfo::height) - .def_property_readonly( - "frame_rate", [](const OutputStreamInfo& o) -> double { - if (o.frame_rate.den == 0) { - TORCH_WARN( - "Invalid frame rate is found: ", - o.frame_rate.num, - "/", - o.frame_rate.den); - return -1; - } - return static_cast(o.frame_rate.num) / o.frame_rate.den; - }); - py::class_(m, "SourceStreamInfo", py::module_local()) - .def_property_readonly( - "media_type", - [](const SrcStreamInfo& s) { - return av_get_media_type_string(s.media_type); - }) - .def_readonly("codec_name", &SrcStreamInfo::codec_name) - .def_readonly("codec_long_name", &SrcStreamInfo::codec_long_name) - .def_readonly("format", &SrcStreamInfo::fmt_name) - .def_readonly("bit_rate", &SrcStreamInfo::bit_rate) - .def_readonly("num_frames", &SrcStreamInfo::num_frames) - .def_readonly("bits_per_sample", &SrcStreamInfo::bits_per_sample) - .def_readonly("metadata", &SrcStreamInfo::metadata) - .def_readonly("sample_rate", &SrcStreamInfo::sample_rate) - .def_readonly("num_channels", &SrcStreamInfo::num_channels) - .def_readonly("width", &SrcStreamInfo::width) - .def_readonly("height", &SrcStreamInfo::height) - .def_readonly("frame_rate", &SrcStreamInfo::frame_rate); - py::class_( - m, "StreamingMediaDecoder", py::module_local()) - .def(py::init< - const std::string&, - const std::optional&, - const std::optional&>()) - .def("num_src_streams", &StreamingMediaDecoder::num_src_streams) - .def("num_out_streams", &StreamingMediaDecoder::num_out_streams) - .def( - "find_best_audio_stream", - &StreamingMediaDecoder::find_best_audio_stream) - .def( - "find_best_video_stream", - &StreamingMediaDecoder::find_best_video_stream) - .def("get_metadata", 
&StreamingMediaDecoder::get_metadata) - .def("get_src_stream_info", &StreamingMediaDecoder::get_src_stream_info) - .def("get_out_stream_info", &StreamingMediaDecoder::get_out_stream_info) - .def("seek", &StreamingMediaDecoder::seek) - .def("add_audio_stream", &StreamingMediaDecoder::add_audio_stream) - .def("add_video_stream", &StreamingMediaDecoder::add_video_stream) - .def("remove_stream", &StreamingMediaDecoder::remove_stream) - .def( - "process_packet", - py::overload_cast&, const double>( - &StreamingMediaDecoder::process_packet)) - .def("process_all_packets", &StreamingMediaDecoder::process_all_packets) - .def("fill_buffer", &StreamingMediaDecoder::fill_buffer) - .def("is_buffer_ready", &StreamingMediaDecoder::is_buffer_ready) - .def("pop_chunks", &StreamingMediaDecoder::pop_chunks); - py::class_( - m, "StreamingMediaDecoderFileObj", py::module_local()) - .def(py::init< - py::object, - const std::optional&, - const std::optional&, - int64_t>()) - .def("num_src_streams", &StreamingMediaDecoderFileObj::num_src_streams) - .def("num_out_streams", &StreamingMediaDecoderFileObj::num_out_streams) - .def( - "find_best_audio_stream", - &StreamingMediaDecoderFileObj::find_best_audio_stream) - .def( - "find_best_video_stream", - &StreamingMediaDecoderFileObj::find_best_video_stream) - .def("get_metadata", &StreamingMediaDecoderFileObj::get_metadata) - .def( - "get_src_stream_info", - &StreamingMediaDecoderFileObj::get_src_stream_info) - .def( - "get_out_stream_info", - &StreamingMediaDecoderFileObj::get_out_stream_info) - .def("seek", &StreamingMediaDecoderFileObj::seek) - .def("add_audio_stream", &StreamingMediaDecoderFileObj::add_audio_stream) - .def("add_video_stream", &StreamingMediaDecoderFileObj::add_video_stream) - .def("remove_stream", &StreamingMediaDecoderFileObj::remove_stream) - .def( - "process_packet", - py::overload_cast&, const double>( - &StreamingMediaDecoder::process_packet)) - .def( - "process_all_packets", - 
&StreamingMediaDecoderFileObj::process_all_packets) - .def("fill_buffer", &StreamingMediaDecoderFileObj::fill_buffer) - .def("is_buffer_ready", &StreamingMediaDecoderFileObj::is_buffer_ready) - .def("pop_chunks", &StreamingMediaDecoderFileObj::pop_chunks); - py::class_( - m, "StreamingMediaDecoderBytes", py::module_local()) - .def(py::init< - std::string_view, - const std::optional&, - const std::optional&, - int64_t>()) - .def("num_src_streams", &StreamingMediaDecoderBytes::num_src_streams) - .def("num_out_streams", &StreamingMediaDecoderBytes::num_out_streams) - .def( - "find_best_audio_stream", - &StreamingMediaDecoderBytes::find_best_audio_stream) - .def( - "find_best_video_stream", - &StreamingMediaDecoderBytes::find_best_video_stream) - .def("get_metadata", &StreamingMediaDecoderBytes::get_metadata) - .def( - "get_src_stream_info", - &StreamingMediaDecoderBytes::get_src_stream_info) - .def( - "get_out_stream_info", - &StreamingMediaDecoderBytes::get_out_stream_info) - .def("seek", &StreamingMediaDecoderBytes::seek) - .def("add_audio_stream", &StreamingMediaDecoderBytes::add_audio_stream) - .def("add_video_stream", &StreamingMediaDecoderBytes::add_video_stream) - .def("remove_stream", &StreamingMediaDecoderBytes::remove_stream) - .def( - "process_packet", - py::overload_cast&, const double>( - &StreamingMediaDecoder::process_packet)) - .def( - "process_all_packets", - &StreamingMediaDecoderBytes::process_all_packets) - .def("fill_buffer", &StreamingMediaDecoderBytes::fill_buffer) - .def("is_buffer_ready", &StreamingMediaDecoderBytes::is_buffer_ready) - .def("pop_chunks", &StreamingMediaDecoderBytes::pop_chunks); -} - -} // namespace -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.cpp b/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.cpp deleted file mode 100644 index 4965ea43ab..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.cpp +++ /dev/null @@ -1,129 +0,0 @@ -#include - 
-namespace torio::io::detail { - -ChunkedBuffer::ChunkedBuffer( - AVRational time_base, - int frames_per_chunk_, - int num_chunks_) - : time_base(time_base), - frames_per_chunk(frames_per_chunk_), - num_chunks(num_chunks_){}; - -bool ChunkedBuffer::is_ready() const { - return num_buffered_frames >= frames_per_chunk; -} - -void ChunkedBuffer::push_frame(torch::Tensor frame, int64_t pts_) { - using namespace torch::indexing; - // Note: - // Audio tensors contain multiple frames while video tensors contain only - // one frame. Video tensors can be regarded as special degenerated case of - // audio, so in the following, we only consider audio processing. - // - // The incoming Tensor might contain more frames than the value of - // `frames_per_chunk`. - // If we push the input tensor to dequeu as-is, then, at the trimming stage, - // the entire frames would be trimmed, this is not ideal. We want to keep - // at most `frames_per_chunk * num_chunks` frames. - // So we slice push the incoming Tensor. - // - - // 1. Check if the last chunk is fully filled. If not, fill it. - // - // <----- frames per chunk ----->^ - // x x x x x x x x x x x x x x x | - // x x x x x x x + + + + + + - - | num_chunks - // - - - - - - - - - - - - - - - | - // <-- filled --><--- remain --->v - // <- append-> - // - if (int64_t filled = num_buffered_frames % frames_per_chunk) { - TORCH_INTERNAL_ASSERT( - chunks.size() > 0, - "There is supposed to be left over frames, but the buffer dequeue is empty."); - int64_t num_frames = frame.size(0); - int64_t remain = frames_per_chunk - filled; - int64_t append = remain < num_frames ? remain : num_frames; - - torch::Tensor prev = chunks.back(); - // prev[filled:filled+append] = frame[:append] - prev.index_put_( - {Slice(filled, filled + append)}, frame.index({Slice(None, append)})); - num_buffered_frames += append; - // frame = frame[append:] - frame = frame.index({Slice(append)}); - pts_ += append; - } - - // 2. 
Return if the number of input frames are smaller than the empty buffer. - // i.e. all the frames are pushed. - if (frame.numel() == 0) { - return; - } - - // 3. Now the existing buffer chunks are fully filled, start adding new chunks - // - // <----- frames per chunk ----->^ - // x x x x x x x x x x x x x x x | - // x x x x x x x x x x x x x x x | num_chunks - // + + + + + + + + + + + + + + + | - // <---------- append ---------->v - // - int64_t num_frames = frame.size(0); - int64_t num_splits = - num_frames / frames_per_chunk + (num_frames % frames_per_chunk ? 1 : 0); - for (int64_t i = 0; i < num_splits; ++i) { - int64_t start = i * frames_per_chunk; - // chunk = frame[i*frames_per_chunk:(i+1) * frames_per_chunk] - auto chunk = frame.index({Slice(start, start + frames_per_chunk)}); - int64_t pts_val = pts_ + start; - int64_t chunk_size = chunk.size(0); - TORCH_INTERNAL_ASSERT( - chunk_size <= frames_per_chunk, - "Chunk size is larger than frames per chunk."); - if (chunk_size < frames_per_chunk) { - auto shape = chunk.sizes().vec(); - shape[0] = frames_per_chunk; - auto temp = torch::empty(shape, frame.options()); - temp.index_put_({Slice(None, chunk_size)}, chunk); - chunk = temp; - } - chunks.push_back(chunk); - pts.push_back(pts_val); - num_buffered_frames += chunk_size; - - // Trim if num_chunks > 0 - if (num_chunks > 0 && chunks.size() > num_chunks) { - TORCH_WARN_ONCE( - "The number of buffered frames exceeded the buffer size. " - "Dropping the old frames. 
" - "To avoid this, you can set a higher buffer_chunk_size value."); - chunks.pop_front(); - num_buffered_frames -= frames_per_chunk; - } - } -} - -std::optional ChunkedBuffer::pop_chunk() { - using namespace torch::indexing; - if (!num_buffered_frames) { - return {}; - } - torch::Tensor chunk = chunks.front(); - double pts_val = double(pts.front()) * time_base.num / time_base.den; - chunks.pop_front(); - pts.pop_front(); - if (num_buffered_frames < frames_per_chunk) { - chunk = chunk.index({Slice(None, num_buffered_frames)}); - } - num_buffered_frames -= chunk.size(0); - return {Chunk{chunk, pts_val}}; -} - -void ChunkedBuffer::flush() { - num_buffered_frames = 0; - chunks.clear(); -} - -} // namespace torio::io::detail diff --git a/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.h b/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.h deleted file mode 100644 index a667c003e2..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/buffer/chunked_buffer.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once -#include -#include - -namespace torio::io::detail { - -class ChunkedBuffer { - // Each AVFrame is converted to a Tensor and stored here. - std::deque chunks; - // Time stamps corresponding the first frame of each chunk - std::deque pts; - AVRational time_base; - - // The number of frames to return as a chunk - // If <0, then user wants to receive all the frames - const int64_t frames_per_chunk; - // The numbe of chunks to retain - const int64_t num_chunks; - // The number of currently stored chunks - // For video, one Tensor corresponds to one frame, but for audio, - // one Tensor contains multiple samples, so we track here. 
- int64_t num_buffered_frames = 0; - - public: - ChunkedBuffer(AVRational time_base, int frames_per_chunk, int num_chunks); - - bool is_ready() const; - void flush(); - std::optional pop_chunk(); - void push_frame(torch::Tensor frame, int64_t pts_); -}; - -} // namespace torio::io::detail diff --git a/src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.cpp b/src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.cpp deleted file mode 100644 index dbc19f2c01..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.cpp +++ /dev/null @@ -1,33 +0,0 @@ -#include - -namespace torio::io::detail { - -UnchunkedBuffer::UnchunkedBuffer(AVRational time_base) : time_base(time_base){}; - -bool UnchunkedBuffer::is_ready() const { - return chunks.size() > 0; -} - -void UnchunkedBuffer::push_frame(torch::Tensor frame, int64_t pts_) { - if (chunks.size() == 0) { - pts = double(pts_) * time_base.num / time_base.den; - } - chunks.push_back(frame); -} - -std::optional UnchunkedBuffer::pop_chunk() { - if (chunks.size() == 0) { - return {}; - } - - auto frames = - torch::cat(std::vector{chunks.begin(), chunks.end()}, 0); - chunks.clear(); - return {Chunk{frames, pts}}; -} - -void UnchunkedBuffer::flush() { - chunks.clear(); -} - -} // namespace torio::io::detail diff --git a/src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h b/src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h deleted file mode 100644 index 461afec89b..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/buffer/unchunked_buffer.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once -#include -#include -#include -#include - -namespace torio::io::detail { - -class UnchunkedBuffer { - // Each AVFrame is converted to a Tensor and stored here. 
- std::deque chunks; - double pts = -1.; - AVRational time_base; - - public: - explicit UnchunkedBuffer(AVRational time_base); - bool is_ready() const; - void push_frame(torch::Tensor frame, int64_t pts_); - std::optional pop_chunk(); - void flush(); -}; - -} // namespace torio::io::detail diff --git a/src/libtorio/ffmpeg/stream_reader/conversion.cpp b/src/libtorio/ffmpeg/stream_reader/conversion.cpp deleted file mode 100644 index c762bc3f57..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/conversion.cpp +++ /dev/null @@ -1,630 +0,0 @@ -#include -#include - -#ifdef USE_CUDA -#include -#endif - -namespace torio::io { - -//////////////////////////////////////////////////////////////////////////////// -// Audio -//////////////////////////////////////////////////////////////////////////////// - -template -AudioConverter::AudioConverter(int c) : num_channels(c) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(num_channels > 0); -} - -template -torch::Tensor AudioConverter::convert(const AVFrame* src) { - if constexpr (is_planar) { - torch::Tensor dst = torch::empty({num_channels, src->nb_samples}, dtype); - convert(src, dst); - return dst.permute({1, 0}); - } else { - torch::Tensor dst = torch::empty({src->nb_samples, num_channels}, dtype); - convert(src, dst); - return dst; - } -} - -// Converts AVFrame* into pre-allocated Tensor. -// The shape must be [C, T] if is_planar otherwise [T, C] -template -void AudioConverter::convert( - const AVFrame* src, - torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(num_channels == src->channels); - - constexpr int bps = []() { - switch (dtype) { - case torch::kUInt8: - return 1; - case torch::kInt16: - return 2; - case torch::kInt32: - case torch::kFloat32: - return 4; - case torch::kInt64: - case torch::kFloat64: - return 8; - } - }(); - - // Note - // FFMpeg's `nb_samples` represnts the number of samples par channel. 
- // whereas, in torchaudio, `num_samples` is used to represent the number of - // samples across channels. torchaudio uses `num_frames` for per-channel - // samples. - if constexpr (is_planar) { - int plane_size = bps * src->nb_samples; - uint8_t* p_dst = static_cast(dst.data_ptr()); - for (int i = 0; i < num_channels; ++i) { - memcpy(p_dst, src->extended_data[i], plane_size); - p_dst += plane_size; - } - } else { - int plane_size = bps * src->nb_samples * num_channels; - memcpy(dst.data_ptr(), src->extended_data[0], plane_size); - } -} - -// Explicit instantiation -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; -template class AudioConverter; - -//////////////////////////////////////////////////////////////////////////////// -// Image -//////////////////////////////////////////////////////////////////////////////// - -namespace { - -torch::Tensor get_image_buffer( - at::IntArrayRef shape, - const torch::Dtype dtype = torch::kUInt8) { - return torch::empty( - shape, torch::TensorOptions().dtype(dtype).layout(torch::kStrided)); -} - -#ifdef USE_CUDA -torch::Tensor get_image_buffer( - at::IntArrayRef shape, - torch::Device device, - const torch::Dtype dtype = torch::kUInt8) { - return torch::empty( - shape, - torch::TensorOptions() - .dtype(dtype) - .layout(torch::kStrided) - .device(device)); -} -#endif // USE_CUDA - -} // namespace - -ImageConverterBase::ImageConverterBase(int h, int w, int c) - : height(h), width(w), num_channels(c) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(height > 0); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(width > 0); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(num_channels > 0); -} - 
-//////////////////////////////////////////////////////////////////////////////// -// Interlaced Image -//////////////////////////////////////////////////////////////////////////////// -void InterlacedImageConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == height); - int stride = width * num_channels; - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) * dst.size(3) == stride); - auto p_dst = dst.data_ptr(); - uint8_t* p_src = src->data[0]; - for (int i = 0; i < height; ++i) { - memcpy(p_dst, p_src, stride); - p_src += src->linesize[0]; - p_dst += stride; - } -} - -torch::Tensor InterlacedImageConverter::convert(const AVFrame* src) { - torch::Tensor buffer = get_image_buffer({1, height, width, num_channels}); - convert(src, buffer); - return buffer.permute({0, 3, 1, 2}); -} - -//////////////////////////////////////////////////////////////////////////////// -// Interlaced 16 Bit Image -//////////////////////////////////////////////////////////////////////////////// -void Interlaced16BitImageConverter::convert( - const AVFrame* src, - torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == height); - int stride = width * num_channels; - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) * dst.size(3) == stride); - auto p_dst = dst.data_ptr(); - uint8_t* p_src = src->data[0]; - for (int i = 0; i < height; ++i) { - memcpy(p_dst, p_src, stride * 2); - p_src += src->linesize[0]; - p_dst += stride; - } - // correct for int16 - dst += 32768; -} - -torch::Tensor Interlaced16BitImageConverter::convert(const AVFrame* src) { - torch::Tensor buffer = - get_image_buffer({1, height, width, num_channels}, torch::kInt16); - convert(src, buffer); - return buffer.permute({0, 3, 1, 2}); -} - 
-//////////////////////////////////////////////////////////////////////////////// -// Planar Image -//////////////////////////////////////////////////////////////////////////////// -void PlanarImageConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == num_channels); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - - for (int i = 0; i < num_channels; ++i) { - torch::Tensor plane = dst.index({0, i}); - uint8_t* p_dst = plane.data_ptr(); - uint8_t* p_src = src->data[i]; - int linesize = src->linesize[i]; - for (int h = 0; h < height; ++h) { - memcpy(p_dst, p_src, width); - p_src += linesize; - p_dst += width; - } - } -} - -torch::Tensor PlanarImageConverter::convert(const AVFrame* src) { - torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}); - convert(src, buffer); - return buffer; -} - -//////////////////////////////////////////////////////////////////////////////// -// YUV420P -//////////////////////////////////////////////////////////////////////////////// -YUV420PConverter::YUV420PConverter(int h, int w) : ImageConverterBase(h, w, 3) { - TORCH_WARN_ONCE( - "The output format YUV420P is selected. 
" - "This will be implicitly converted to YUV444P, " - "in which all the color components Y, U, V have the same dimension."); -} - -void YUV420PConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - (AVPixelFormat)(src->format) == AV_PIX_FMT_YUV420P); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - - // Write Y plane directly - { - uint8_t* p_dst = dst.data_ptr(); - uint8_t* p_src = src->data[0]; - for (int h = 0; h < height; ++h) { - memcpy(p_dst, p_src, width); - p_dst += width; - p_src += src->linesize[0]; - } - } - // Chroma (U and V planes) are subsamapled by 2 in both vertical and - // holizontal directions. - // https://en.wikipedia.org/wiki/Chroma_subsampling - // Since we are returning data in Tensor, which has the same size for all - // color planes, we need to upsample the UV planes. PyTorch has interpolate - // function but it does not work for int16 type. So we manually copy them. 
- // - // block1 block2 block3 block4 - // ab -> aabb = a b * a b * * - // cd aabb a b a b - // ccdd c d c d - // ccdd c d c d - // - auto block00 = dst.slice(2, 0, {}, 2).slice(3, 0, {}, 2); - auto block01 = dst.slice(2, 0, {}, 2).slice(3, 1, {}, 2); - auto block10 = dst.slice(2, 1, {}, 2).slice(3, 0, {}, 2); - auto block11 = dst.slice(2, 1, {}, 2).slice(3, 1, {}, 2); - for (int i = 1; i < 3; ++i) { - // borrow data - auto tmp = torch::from_blob( - src->data[i], - {height / 2, width / 2}, - {src->linesize[i], 1}, - [](void*) {}, - torch::TensorOptions().dtype(torch::kUInt8).layout(torch::kStrided)); - // Copy to each block - block00.slice(1, i, i + 1).copy_(tmp); - block01.slice(1, i, i + 1).copy_(tmp); - block10.slice(1, i, i + 1).copy_(tmp); - block11.slice(1, i, i + 1).copy_(tmp); - } -} - -torch::Tensor YUV420PConverter::convert(const AVFrame* src) { - torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}); - convert(src, buffer); - return buffer; -} - -//////////////////////////////////////////////////////////////////////////////// -// YUV420P10LE -//////////////////////////////////////////////////////////////////////////////// -YUV420P10LEConverter::YUV420P10LEConverter(int h, int w) - : ImageConverterBase(h, w, 3) { - TORCH_WARN_ONCE( - "The output format YUV420PLE is selected. 
" - "This will be implicitly converted to YUV444P (16-bit), " - "in which all the color components Y, U, V have the same dimension."); -} - -void YUV420P10LEConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - (AVPixelFormat)(src->format) == AV_PIX_FMT_YUV420P10LE); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kInt16); - - // Write Y plane directly - { - int16_t* p_dst = dst.data_ptr(); - uint8_t* p_src = src->data[0]; - for (int h = 0; h < height; ++h) { - memcpy(p_dst, p_src, (size_t)width * 2); - p_dst += width; - p_src += src->linesize[0]; - } - } - // Chroma (U and V planes) are subsamapled by 2 in both vertical and - // holizontal directions. - // https://en.wikipedia.org/wiki/Chroma_subsampling - // Since we are returning data in Tensor, which has the same size for all - // color planes, we need to upsample the UV planes. PyTorch has interpolate - // function but it does not work for int16 type. So we manually copy them. 
- // - // block1 block2 block3 block4 - // ab -> aabb = a b * a b * * - // cd aabb a b a b - // ccdd c d c d - // ccdd c d c d - // - auto block00 = dst.slice(2, 0, {}, 2).slice(3, 0, {}, 2); - auto block01 = dst.slice(2, 0, {}, 2).slice(3, 1, {}, 2); - auto block10 = dst.slice(2, 1, {}, 2).slice(3, 0, {}, 2); - auto block11 = dst.slice(2, 1, {}, 2).slice(3, 1, {}, 2); - for (int i = 1; i < 3; ++i) { - // borrow data - auto tmp = torch::from_blob( - src->data[i], - {height / 2, width / 2}, - {src->linesize[i] / 2, 1}, - [](void*) {}, - torch::TensorOptions().dtype(torch::kInt16).layout(torch::kStrided)); - // Copy to each block - block00.slice(1, i, i + 1).copy_(tmp); - block01.slice(1, i, i + 1).copy_(tmp); - block10.slice(1, i, i + 1).copy_(tmp); - block11.slice(1, i, i + 1).copy_(tmp); - } -} - -torch::Tensor YUV420P10LEConverter::convert(const AVFrame* src) { - torch::Tensor buffer = - get_image_buffer({1, num_channels, height, width}, torch::kInt16); - convert(src, buffer); - return buffer; -} - -//////////////////////////////////////////////////////////////////////////////// -// NV12 -//////////////////////////////////////////////////////////////////////////////// -NV12Converter::NV12Converter(int h, int w) : ImageConverterBase(h, w, 3) { - TORCH_WARN_ONCE( - "The output format NV12 is selected. 
" - "This will be implicitly converted to YUV444P, " - "in which all the color components Y, U, V have the same dimension."); -} - -void NV12Converter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - (AVPixelFormat)(src->format) == AV_PIX_FMT_NV12); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - - // Write Y plane directly - { - uint8_t* p_dst = dst.data_ptr(); - uint8_t* p_src = src->data[0]; - for (int h = 0; h < height; ++h) { - memcpy(p_dst, p_src, width); - p_dst += width; - p_src += src->linesize[0]; - } - } - // Write intermediate UV plane - { - auto tmp = torch::from_blob( - src->data[1], - {height / 2, width}, - {src->linesize[1], 1}, - [](void*) {}, - torch::TensorOptions().dtype(torch::kUInt8).layout(torch::kStrided)); - tmp = tmp.view({1, height / 2, width / 2, 2}).permute({0, 3, 1, 2}); - auto dst_uv = dst.slice(1, 1, 3); - dst_uv.slice(2, 0, {}, 2).slice(3, 0, {}, 2).copy_(tmp); - dst_uv.slice(2, 0, {}, 2).slice(3, 1, {}, 2).copy_(tmp); - dst_uv.slice(2, 1, {}, 2).slice(3, 0, {}, 2).copy_(tmp); - dst_uv.slice(2, 1, {}, 2).slice(3, 1, {}, 2).copy_(tmp); - } -} - -torch::Tensor NV12Converter::convert(const AVFrame* src) { - torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}); - convert(src, buffer); - return buffer; -} - -#ifdef USE_CUDA - -CudaImageConverterBase::CudaImageConverterBase(const torch::Device& device) - : device(device) {} - -//////////////////////////////////////////////////////////////////////////////// -// NV12 CUDA -//////////////////////////////////////////////////////////////////////////////// -NV12CudaConverter::NV12CudaConverter(const torch::Device& device) - : CudaImageConverterBase(device) { 
- TORCH_WARN_ONCE( - "The output format NV12 is selected. " - "This will be implicitly converted to YUV444P, " - "in which all the color components Y, U, V have the same dimension."); -} - -void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kUInt8); - - auto fmt = (AVPixelFormat)(src->format); - AVHWFramesContext* hwctx = (AVHWFramesContext*)src->hw_frames_ctx->data; - AVPixelFormat sw_fmt = hwctx->sw_format; - - TORCH_INTERNAL_ASSERT( - AV_PIX_FMT_CUDA == fmt, - "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); - TORCH_INTERNAL_ASSERT( - AV_PIX_FMT_NV12 == sw_fmt, - "Expected NV12 format. 
Found: ", - av_get_pix_fmt_name(sw_fmt)); - - // Write Y plane directly - auto status = cudaMemcpy2D( - dst.data_ptr(), - width, - src->data[0], - src->linesize[0], - width, - height, - cudaMemcpyDeviceToDevice); - TORCH_CHECK(cudaSuccess == status, "Failed to copy Y plane to Cuda tensor."); - // Preapare intermediate UV planes - status = cudaMemcpy2D( - tmp_uv.data_ptr(), - width, - src->data[1], - src->linesize[1], - width, - height / 2, - cudaMemcpyDeviceToDevice); - TORCH_CHECK(cudaSuccess == status, "Failed to copy UV plane to Cuda tensor."); - // Upsample width and height - namespace F = torch::nn::functional; - torch::Tensor uv = F::interpolate( - tmp_uv.permute({0, 3, 1, 2}), - F::InterpolateFuncOptions() - .mode(torch::kNearest) - .size(std::vector({height, width}))); - // Write to the UV plane - // dst[:, 1:] = uv - using namespace torch::indexing; - dst.index_put_({Slice(), Slice(1)}, uv); -} - -torch::Tensor NV12CudaConverter::convert(const AVFrame* src) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - if (!init) { - height = src->height; - width = src->width; - tmp_uv = - get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kUInt8); - init = true; - } - - torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device); - convert(src, buffer); - return buffer; -} - -//////////////////////////////////////////////////////////////////////////////// -// P010 CUDA -//////////////////////////////////////////////////////////////////////////////// -P010CudaConverter::P010CudaConverter(const torch::Device& device) - : CudaImageConverterBase{device} { - TORCH_WARN_ONCE( - "The output format P010 is selected. 
" - "This will be implicitly converted to YUV444P, " - "in which all the color components Y, U, V have the same dimension."); -} - -void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kInt16); - - auto fmt = (AVPixelFormat)(src->format); - AVHWFramesContext* hwctx = (AVHWFramesContext*)src->hw_frames_ctx->data; - AVPixelFormat sw_fmt = hwctx->sw_format; - - TORCH_INTERNAL_ASSERT( - AV_PIX_FMT_CUDA == fmt, - "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); - TORCH_INTERNAL_ASSERT( - AV_PIX_FMT_P010 == sw_fmt, - "Expected P010 format. Found: ", - av_get_pix_fmt_name(sw_fmt)); - - // Write Y plane directly - auto status = cudaMemcpy2D( - dst.data_ptr(), - width * 2, - src->data[0], - src->linesize[0], - width * 2, - height, - cudaMemcpyDeviceToDevice); - TORCH_CHECK(cudaSuccess == status, "Failed to copy Y plane to CUDA tensor."); - // Prepare intermediate UV planes - status = cudaMemcpy2D( - tmp_uv.data_ptr(), - width * 2, - src->data[1], - src->linesize[1], - width * 2, - height / 2, - cudaMemcpyDeviceToDevice); - TORCH_CHECK(cudaSuccess == status, "Failed to copy UV plane to CUDA tensor."); - // Write to the UV plane - torch::Tensor uv = tmp_uv.permute({0, 3, 1, 2}); - using namespace torch::indexing; - // very simplistic upscale using indexing since interpolate doesn't support - // shorts - dst.index_put_( - {Slice(), Slice(1, 3), Slice(None, None, 2), Slice(None, None, 2)}, uv); - dst.index_put_( - {Slice(), Slice(1, 3), Slice(1, None, 2), Slice(None, None, 2)}, uv); - dst.index_put_( - {Slice(), Slice(1, 3), Slice(None, None, 2), Slice(1, None, 2)}, 
uv); - dst.index_put_( - {Slice(), Slice(1, 3), Slice(1, None, 2), Slice(1, None, 2)}, uv); - // correct for int16 - dst += 32768; -} - -torch::Tensor P010CudaConverter::convert(const AVFrame* src) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - if (!init) { - height = src->height; - width = src->width; - tmp_uv = - get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kInt16); - init = true; - } - - torch::Tensor buffer = - get_image_buffer({1, 3, height, width}, device, torch::kInt16); - convert(src, buffer); - return buffer; -} - -//////////////////////////////////////////////////////////////////////////////// -// YUV444P CUDA -//////////////////////////////////////////////////////////////////////////////// -YUV444PCudaConverter::YUV444PCudaConverter(const torch::Device& device) - : CudaImageConverterBase(device) {} - -void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kUInt8); - - auto fmt = (AVPixelFormat)(src->format); - AVHWFramesContext* hwctx = (AVHWFramesContext*)src->hw_frames_ctx->data; - AVPixelFormat sw_fmt = hwctx->sw_format; - - TORCH_INTERNAL_ASSERT( - AV_PIX_FMT_CUDA == fmt, - "Expected CUDA frame. Found: ", - av_get_pix_fmt_name(fmt)); - TORCH_INTERNAL_ASSERT( - AV_PIX_FMT_YUV444P == sw_fmt, - "Expected YUV444P format. 
Found: ", - av_get_pix_fmt_name(sw_fmt)); - - // Write Y plane directly - for (int i = 0; i < 3; ++i) { - auto status = cudaMemcpy2D( - dst.index({0, i}).data_ptr(), - width, - src->data[i], - src->linesize[i], - width, - height, - cudaMemcpyDeviceToDevice); - TORCH_CHECK( - cudaSuccess == status, "Failed to copy plane ", i, " to CUDA tensor."); - } -} - -torch::Tensor YUV444PCudaConverter::convert(const AVFrame* src) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); - if (!init) { - height = src->height; - width = src->width; - init = true; - } - torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device); - convert(src, buffer); - return buffer; -} - -#endif - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/conversion.h b/src/libtorio/ffmpeg/stream_reader/conversion.h deleted file mode 100644 index ed01d8f6d8..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/conversion.h +++ /dev/null @@ -1,129 +0,0 @@ -#pragma once -#include -#include - -namespace torio::io { - -//////////////////////////////////////////////////////////////////////////////// -// Audio -//////////////////////////////////////////////////////////////////////////////// -template -class AudioConverter { - const int num_channels; - - public: - explicit AudioConverter(int num_channels); - - // Converts AVFrame* into Tensor of [T, C] - torch::Tensor convert(const AVFrame* src); - - // Converts AVFrame* into pre-allocated Tensor. 
- // The shape must be [C, T] if is_planar otherwise [T, C] - void convert(const AVFrame* src, torch::Tensor& dst); -}; - -//////////////////////////////////////////////////////////////////////////////// -// Image -//////////////////////////////////////////////////////////////////////////////// -struct ImageConverterBase { - const int height; - const int width; - const int num_channels; - - ImageConverterBase(int h, int w, int c); -}; - -//////////////////////////////////////////////////////////////////////////////// -// Interlaced Images - NHWC -//////////////////////////////////////////////////////////////////////////////// -struct InterlacedImageConverter : public ImageConverterBase { - using ImageConverterBase::ImageConverterBase; - // convert AVFrame* into Tensor of NCHW format - torch::Tensor convert(const AVFrame* src); - // convert AVFrame* into pre-allocated Tensor of NHWC format - void convert(const AVFrame* src, torch::Tensor& dst); -}; - -struct Interlaced16BitImageConverter : public ImageConverterBase { - using ImageConverterBase::ImageConverterBase; - // convert AVFrame* into Tensor of NCHW format - torch::Tensor convert(const AVFrame* src); - // convert AVFrame* into pre-allocated Tensor of NHWC format - void convert(const AVFrame* src, torch::Tensor& dst); -}; - -//////////////////////////////////////////////////////////////////////////////// -// Planar Images - NCHW -//////////////////////////////////////////////////////////////////////////////// -struct PlanarImageConverter : public ImageConverterBase { - using ImageConverterBase::ImageConverterBase; - void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -//////////////////////////////////////////////////////////////////////////////// -// Family of YUVs - NCHW -//////////////////////////////////////////////////////////////////////////////// -class YUV420PConverter : public ImageConverterBase { - public: - YUV420PConverter(int height, int width); 
- void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -class YUV420P10LEConverter : public ImageConverterBase { - public: - YUV420P10LEConverter(int height, int width); - void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -class NV12Converter : public ImageConverterBase { - public: - NV12Converter(int height, int width); - void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -#ifdef USE_CUDA - -// Note: -// GPU decoders are tricky. They allow to change the resolution as part of -// decoder option, and the resulting resolution is (seemingly) not retrievable. -// Therefore, we adopt delayed frame size initialization. -// For that purpose, we do not inherit from ImageConverterBase. -struct CudaImageConverterBase { - const torch::Device device; - bool init = false; - int height = -1; - int width = -1; - explicit CudaImageConverterBase(const torch::Device& device); -}; - -class NV12CudaConverter : CudaImageConverterBase { - torch::Tensor tmp_uv{}; - - public: - explicit NV12CudaConverter(const torch::Device& device); - void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -class P010CudaConverter : CudaImageConverterBase { - torch::Tensor tmp_uv{}; - - public: - explicit P010CudaConverter(const torch::Device& device); - void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -class YUV444PCudaConverter : CudaImageConverterBase { - public: - explicit YUV444PCudaConverter(const torch::Device& device); - void convert(const AVFrame* src, torch::Tensor& dst); - torch::Tensor convert(const AVFrame* src); -}; - -#endif -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/packet_buffer.cpp b/src/libtorio/ffmpeg/stream_reader/packet_buffer.cpp deleted file mode 100644 index 315c37191f..0000000000 --- 
a/src/libtorio/ffmpeg/stream_reader/packet_buffer.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include - -namespace torio::io { -void PacketBuffer::push_packet(AVPacket* packet) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(packet, "Packet is null."); - AVPacket* p = av_packet_clone(packet); - TORCH_INTERNAL_ASSERT(p, "Failed to clone packet."); - packets.emplace_back(p); -} -std::vector PacketBuffer::pop_packets() { - std::vector ret{ - std::make_move_iterator(packets.begin()), - std::make_move_iterator(packets.end())}; - packets.clear(); - return ret; -} -bool PacketBuffer::has_packets() { - return packets.size() > 0; -} -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/packet_buffer.h b/src/libtorio/ffmpeg/stream_reader/packet_buffer.h deleted file mode 100644 index 49a823c541..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/packet_buffer.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once -#include - -namespace torio { -namespace io { -class PacketBuffer { - public: - void push_packet(AVPacket* packet); - std::vector pop_packets(); - bool has_packets(); - - private: - std::deque packets; -}; -} // namespace io -} // namespace torio diff --git a/src/libtorio/ffmpeg/stream_reader/post_process.cpp b/src/libtorio/ffmpeg/stream_reader/post_process.cpp deleted file mode 100644 index f2cd31fa2f..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/post_process.cpp +++ /dev/null @@ -1,620 +0,0 @@ -#include -#include -#include -#include - -namespace torio::io { -namespace detail { -namespace { - -/////////////////////////////////////////////////////////////////////////////// -// FilterGraphWrapper (FilterGraph + reset feature) -/////////////////////////////////////////////////////////////////////////////// -using FilterGraphFactory = std::function; - -FilterGraphFactory get_audio_factory( - AVRational time_base, - AVCodecContext* codec_ctx) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(codec_ctx->codec_type == AVMEDIA_TYPE_AUDIO); - return [fmt = codec_ctx->sample_fmt, - 
time_base, - rate = codec_ctx->sample_rate, - channel_layout = codec_ctx->channel_layout]( - const std::string& filter_desc) -> FilterGraph { - FilterGraph f; - f.add_audio_src(fmt, time_base, rate, channel_layout); - f.add_audio_sink(); - f.add_process(filter_desc); - f.create_filter(); - return f; - }; -} - -FilterGraphFactory get_video_factory( - AVRational time_base, - AVRational frame_rate, - AVCodecContext* codec_ctx) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(codec_ctx->codec_type == AVMEDIA_TYPE_VIDEO); - return [fmt = codec_ctx->pix_fmt, - time_base, - frame_rate, - w = codec_ctx->width, - h = codec_ctx->height, - ratio = codec_ctx->sample_aspect_ratio, - hw_frames_ctx = codec_ctx->hw_frames_ctx]( - const std::string& filter_desc) -> FilterGraph { - FilterGraph f; - f.add_video_src(fmt, time_base, frame_rate, w, h, ratio); - f.add_video_sink(); - f.add_process(filter_desc); - if (hw_frames_ctx) { - f.create_filter(av_buffer_ref(hw_frames_ctx)); - } else { - f.create_filter(); - } - return f; - }; -} - -struct FilterGraphWrapper { - const std::string desc; - - private: - FilterGraphFactory factory; - - public: - FilterGraph filter; - - // Constructor for audio input - FilterGraphWrapper( - AVRational input_time_base, - AVCodecContext* codec_ctx, - const std::string& desc) - : desc(desc), - factory(get_audio_factory(input_time_base, codec_ctx)), - filter(factory(desc)) {} - - // Constructor for video input - FilterGraphWrapper( - AVRational input_time_base, - AVRational frame_rate, - AVCodecContext* codec_ctx, - const std::string& desc) - : desc(desc), - factory(get_video_factory(input_time_base, frame_rate, codec_ctx)), - filter(factory(desc)) {} - - void reset() { - filter = factory(desc); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// ProcessImpl -/////////////////////////////////////////////////////////////////////////////// -template -struct ProcessImpl : public IPostDecodeProcess { - private: - AVFramePtr 
frame{alloc_avframe()}; - FilterGraphWrapper filter_wrapper; - - public: - Converter converter; - Buffer buffer; - - ProcessImpl( - FilterGraphWrapper&& filter_wrapper, - Converter&& converter, - Buffer&& buffer) - : filter_wrapper(std::move(filter_wrapper)), - converter(std::move(converter)), - buffer(std::move(buffer)) {} - - bool is_buffer_ready() const override { - return buffer.is_ready(); - } - - const std::string& get_filter_desc() const override { - return filter_wrapper.desc; - } - - FilterGraphOutputInfo get_filter_output_info() const override { - return filter_wrapper.filter.get_output_info(); - } - - void flush() override { - filter_wrapper.reset(); - buffer.flush(); - } - - int process_frame(AVFrame* in_frame) override { - int ret = filter_wrapper.filter.add_frame(in_frame); - while (ret >= 0) { - ret = filter_wrapper.filter.get_frame(frame); - // AVERROR(EAGAIN) means that new input data is required to return new - // output. - if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { - return 0; - } - if (ret >= 0) { - buffer.push_frame(converter.convert(frame), frame->pts); - } - av_frame_unref(frame); - } - return ret; - } - - std::optional pop_chunk() override { - return buffer.pop_chunk(); - } -}; - -/////////////////////////////////////////////////////////////////////////////// -// Audio -/////////////////////////////////////////////////////////////////////////////// -std::unique_ptr get_unchunked_audio_process( - FilterGraphWrapper&& filter) { - auto i = filter.filter.get_output_info(); - - TORCH_INTERNAL_ASSERT( - i.type == AVMEDIA_TYPE_AUDIO, - "Unsupported media type found: ", - av_get_media_type_string(i.type)); - - using B = UnchunkedBuffer; - - switch (auto fmt = (AVSampleFormat)i.format; fmt) { - case AV_SAMPLE_FMT_U8: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_S16: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), 
C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_S32: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_S64: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_FLT: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_DBL: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_U8P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_S16P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_S32P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_S64P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_FLTP: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - case AV_SAMPLE_FMT_DBLP: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, B{i.time_base}); - } - default: - TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); - } -} - -std::unique_ptr get_chunked_audio_process( - FilterGraphWrapper&& filter, - int frames_per_chunk, - int num_chunks) { - auto i = filter.filter.get_output_info(); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - i.type == AVMEDIA_TYPE_AUDIO, - "Unsupported media type found: ", - av_get_media_type_string(i.type)); - - using B = ChunkedBuffer; - B buffer{i.time_base, frames_per_chunk, num_chunks}; - - 
switch (auto fmt = (AVSampleFormat)i.format; fmt) { - case AV_SAMPLE_FMT_U8: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_S16: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_S32: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_S64: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_FLT: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_DBL: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_U8P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_S16P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_S32P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_S64P: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_FLTP: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - case AV_SAMPLE_FMT_DBLP: { - using C = AudioConverter; - return std::make_unique>( - std::move(filter), C{i.num_channels}, std::move(buffer)); - } - default: - TORCH_INTERNAL_ASSERT( - false, "Unexpected audio type:", av_get_sample_fmt_name(fmt)); - } -} - 
-/////////////////////////////////////////////////////////////////////////////// -// Video -/////////////////////////////////////////////////////////////////////////////// -std::unique_ptr get_unchunked_video_process( - FilterGraphWrapper&& filter) { - auto i = filter.filter.get_output_info(); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - i.type == AVMEDIA_TYPE_VIDEO, - "Unsupported media type found: ", - av_get_media_type_string(i.type)); - - auto h = i.height; - auto w = i.width; - auto tb = i.time_base; - - using B = UnchunkedBuffer; - switch (auto fmt = (AVPixelFormat)i.format; fmt) { - case AV_PIX_FMT_RGB24: - case AV_PIX_FMT_BGR24: { - using C = InterlacedImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 3}, B{tb}); - } - case AV_PIX_FMT_ARGB: - case AV_PIX_FMT_RGBA: - case AV_PIX_FMT_ABGR: - case AV_PIX_FMT_BGRA: { - using C = InterlacedImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 4}, B{tb}); - } - case AV_PIX_FMT_GRAY8: { - using C = InterlacedImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 1}, B{tb}); - } - case AV_PIX_FMT_RGB48LE: { - using C = Interlaced16BitImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 3}, B{tb}); - } - case AV_PIX_FMT_YUV444P: { - using C = PlanarImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 3}, B{tb}); - } - case AV_PIX_FMT_YUV420P: { - using C = YUV420PConverter; - return std::make_unique>( - std::move(filter), C{h, w}, B{tb}); - } - case AV_PIX_FMT_YUV420P10LE: { - using C = YUV420P10LEConverter; - return std::make_unique>( - std::move(filter), C{h, w}, B{tb}); - } - case AV_PIX_FMT_NV12: { - using C = NV12Converter; - return std::make_unique>( - std::move(filter), C{h, w}, B{tb}); - } - default: { - TORCH_INTERNAL_ASSERT( - false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); - } - } -} - -std::unique_ptr get_unchunked_cuda_video_process( - FilterGraphWrapper&& filter, - const torch::Device& 
device) { -#ifndef USE_CUDA - TORCH_INTERNAL_ASSERT( - false, - "USE_CUDA is not defined, but CUDA decoding process was requested."); -#else - auto i = filter.filter.get_output_info(); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - i.type == AVMEDIA_TYPE_VIDEO, - "Unsupported media type found: ", - av_get_media_type_string(i.type)); - - using B = UnchunkedBuffer; - switch (auto fmt = (AVPixelFormat)i.format; fmt) { - case AV_PIX_FMT_NV12: { - using C = NV12CudaConverter; - return std::make_unique>( - std::move(filter), C{device}, B{i.time_base}); - } - case AV_PIX_FMT_P010: { - using C = P010CudaConverter; - return std::make_unique>( - std::move(filter), C{device}, B{i.time_base}); - } - case AV_PIX_FMT_YUV444P: { - using C = YUV444PCudaConverter; - return std::make_unique>( - std::move(filter), C{device}, B{i.time_base}); - } - case AV_PIX_FMT_P016: { - TORCH_CHECK( - false, - "Unsupported video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); - } - default: { - TORCH_CHECK( - false, - "Unexpected video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); - } - } -#endif -} - -std::unique_ptr get_chunked_video_process( - FilterGraphWrapper&& filter, - int frames_per_chunk, - int num_chunks) { - auto i = filter.filter.get_output_info(); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - i.type == AVMEDIA_TYPE_VIDEO, - "Unsupported media type found: ", - av_get_media_type_string(i.type)); - - auto h = i.height; - auto w = i.width; - auto tb = i.time_base; - - using B = ChunkedBuffer; - switch (auto fmt = (AVPixelFormat)i.format; fmt) { - case AV_PIX_FMT_RGB24: - case AV_PIX_FMT_BGR24: { - using C = InterlacedImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 3}, B{tb, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_ARGB: - case AV_PIX_FMT_RGBA: - case AV_PIX_FMT_ABGR: - case AV_PIX_FMT_BGRA: { - using C = InterlacedImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 4}, B{tb, frames_per_chunk, num_chunks}); - } - case 
AV_PIX_FMT_GRAY8: { - using C = InterlacedImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 1}, B{tb, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_RGB48LE: { - using C = Interlaced16BitImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 3}, B{tb, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_YUV444P: { - using C = PlanarImageConverter; - return std::make_unique>( - std::move(filter), C{h, w, 3}, B{tb, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_YUV420P: { - using C = YUV420PConverter; - return std::make_unique>( - std::move(filter), C{h, w}, B{tb, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_YUV420P10LE: { - using C = YUV420P10LEConverter; - return std::make_unique>( - std::move(filter), C{h, w}, B{tb, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_NV12: { - using C = NV12Converter; - return std::make_unique>( - std::move(filter), C{h, w}, B{tb, frames_per_chunk, num_chunks}); - } - default: { - TORCH_INTERNAL_ASSERT( - false, "Unexpected video format found: ", av_get_pix_fmt_name(fmt)); - } - } -} - -std::unique_ptr get_chunked_cuda_video_process( - FilterGraphWrapper&& filter, - int frames_per_chunk, - int num_chunks, - const torch::Device& device) { -#ifndef USE_CUDA - TORCH_INTERNAL_ASSERT( - false, - "USE_CUDA is not defined, but CUDA decoding process was requested."); -#else - auto i = filter.filter.get_output_info(); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - i.type == AVMEDIA_TYPE_VIDEO, - "Unsupported media type found: ", - av_get_media_type_string(i.type)); - - using B = ChunkedBuffer; - switch (auto fmt = (AVPixelFormat)i.format; fmt) { - case AV_PIX_FMT_NV12: { - using C = NV12CudaConverter; - return std::make_unique>( - std::move(filter), - C{device}, - B{i.time_base, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_P010: { - using C = P010CudaConverter; - return std::make_unique>( - std::move(filter), - C{device}, - B{i.time_base, frames_per_chunk, num_chunks}); - 
} - case AV_PIX_FMT_YUV444P: { - using C = YUV444PCudaConverter; - return std::make_unique>( - std::move(filter), - C{device}, - B{i.time_base, frames_per_chunk, num_chunks}); - } - case AV_PIX_FMT_P016: { - TORCH_CHECK( - false, - "Unsupported video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); - } - default: { - TORCH_CHECK( - false, - "Unexpected video format found in CUDA HW: ", - av_get_pix_fmt_name(fmt)); - } - } -#endif -} -} // namespace -} // namespace detail - -std::unique_ptr get_audio_process( - AVRational input_time_base, - AVCodecContext* codec_ctx, - const std::string& desc, - int frames_per_chunk, - int num_chunks) { - TORCH_CHECK( - frames_per_chunk > 0 || frames_per_chunk == -1, - "`frames_per_chunk` must be positive or -1. Found: ", - frames_per_chunk); - - TORCH_CHECK( - num_chunks > 0 || num_chunks == -1, - "`num_chunks` must be positive or -1. Found: ", - num_chunks); - - detail::FilterGraphWrapper filter{input_time_base, codec_ctx, desc}; - - if (frames_per_chunk == -1) { - return detail::get_unchunked_audio_process(std::move(filter)); - } - return detail::get_chunked_audio_process( - std::move(filter), frames_per_chunk, num_chunks); -} - -std::unique_ptr get_video_process( - AVRational input_time_base, - AVRational frame_rate, - AVCodecContext* codec_ctx, - const std::string& desc, - int frames_per_chunk, - int num_chunks, - const torch::Device& device) { - TORCH_CHECK( - frames_per_chunk > 0 || frames_per_chunk == -1, - "`frames_per_chunk` must be positive or -1. Found: ", - frames_per_chunk); - - TORCH_CHECK( - num_chunks > 0 || num_chunks == -1, - "`num_chunks` must be positive or -1. 
Found: ", - num_chunks); - - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - device.is_cuda() || device.is_cpu(), "Unexpected device type: ", device); - - detail::FilterGraphWrapper filter{ - input_time_base, frame_rate, codec_ctx, desc}; - - if (frames_per_chunk == -1) { - if (device.is_cuda()) { - return detail::get_unchunked_cuda_video_process( - std::move(filter), device); - } - return detail::get_unchunked_video_process(std::move(filter)); - } - if (device.is_cuda()) { - return detail::get_chunked_cuda_video_process( - std::move(filter), frames_per_chunk, num_chunks, device); - } - return detail::get_chunked_video_process( - std::move(filter), frames_per_chunk, num_chunks); -} -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/post_process.h b/src/libtorio/ffmpeg/stream_reader/post_process.h deleted file mode 100644 index c5dea5fdc1..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/post_process.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once -#include -#include - -namespace torio::io { - -struct IPostDecodeProcess { - virtual ~IPostDecodeProcess() = default; - - virtual int process_frame(AVFrame* frame) = 0; - virtual std::optional pop_chunk() = 0; - virtual bool is_buffer_ready() const = 0; - virtual const std::string& get_filter_desc() const = 0; - virtual FilterGraphOutputInfo get_filter_output_info() const = 0; - virtual void flush() = 0; -}; - -std::unique_ptr get_audio_process( - AVRational input_time_base, - AVCodecContext* codec_ctx, - const std::string& desc, - int frames_per_chunk, - int num_chunks); - -std::unique_ptr get_video_process( - AVRational input_time_base, - AVRational frame_rate, - AVCodecContext* codec_ctx, - const std::string& desc, - int frames_per_chunk, - int num_chunks, - const torch::Device& device); - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/stream_processor.cpp b/src/libtorio/ffmpeg/stream_reader/stream_processor.cpp deleted file mode 100644 index b3d9a783b0..0000000000 --- 
a/src/libtorio/ffmpeg/stream_reader/stream_processor.cpp +++ /dev/null @@ -1,396 +0,0 @@ -#include -#include -#include - -namespace torio::io { - -namespace { -AVCodecContextPtr alloc_codec_context( - enum AVCodecID codec_id, - const std::optional& decoder_name) { - const AVCodec* codec = [&]() { - if (decoder_name) { - const AVCodec* c = - avcodec_find_decoder_by_name(decoder_name.value().c_str()); - TORCH_CHECK(c, "Unsupported codec: ", decoder_name.value()); - return c; - } else { - const AVCodec* c = avcodec_find_decoder(codec_id); - TORCH_CHECK(c, "Unsupported codec: ", avcodec_get_name(codec_id)); - return c; - } - }(); - - AVCodecContext* codec_ctx = avcodec_alloc_context3(codec); - TORCH_CHECK(codec_ctx, "Failed to allocate CodecContext."); - return AVCodecContextPtr(codec_ctx); -} - -#ifdef USE_CUDA -const AVCodecHWConfig* get_cuda_config(const AVCodec* codec) { - for (int i = 0;; ++i) { - const AVCodecHWConfig* config = avcodec_get_hw_config(codec, i); - if (!config) { - break; - } - if (config->device_type == AV_HWDEVICE_TYPE_CUDA && - config->methods & AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX) { - return config; - } - } - TORCH_CHECK( - false, - "CUDA device was requested, but the codec \"", - codec->name, - "\" is not supported."); -} - -enum AVPixelFormat get_hw_format( - AVCodecContext* codec_ctx, - const enum AVPixelFormat* pix_fmts) { - const AVCodecHWConfig* cfg = static_cast(codec_ctx->opaque); - for (const enum AVPixelFormat* p = pix_fmts; *p != -1; p++) { - if (*p == cfg->pix_fmt) { - // Note - // The HW decode example uses generic approach - // https://ffmpeg.org/doxygen/4.1/hw__decode_8c_source.html#l00063 - // But this approach finalizes the codec configuration when the first - // frame comes in. - // We need to inspect the codec configuration right after the codec is - // opened. - // So we add short cut for known patterns. 
- // yuv420p (h264) -> nv12 - // yuv420p10le (hevc/h265) -> p010le - switch (codec_ctx->pix_fmt) { - case AV_PIX_FMT_YUV420P: { - codec_ctx->pix_fmt = AV_PIX_FMT_CUDA; - codec_ctx->sw_pix_fmt = AV_PIX_FMT_NV12; - break; - } - case AV_PIX_FMT_YUV420P10LE: { - codec_ctx->pix_fmt = AV_PIX_FMT_CUDA; - codec_ctx->sw_pix_fmt = AV_PIX_FMT_P010LE; - break; - } - default:; - } - return *p; - } - } - TORCH_WARN("Failed to get HW surface format."); - return AV_PIX_FMT_NONE; -} -#endif // USE_CUDA - -AVBufferRef* get_hw_frames_ctx(AVCodecContext* codec_ctx) { - AVBufferRef* p = av_hwframe_ctx_alloc(codec_ctx->hw_device_ctx); - TORCH_CHECK( - p, - "Failed to allocate CUDA frame context from device context at ", - codec_ctx->hw_device_ctx); - auto frames_ctx = (AVHWFramesContext*)(p->data); - frames_ctx->format = codec_ctx->pix_fmt; - frames_ctx->sw_format = codec_ctx->sw_pix_fmt; - frames_ctx->width = codec_ctx->width; - frames_ctx->height = codec_ctx->height; - frames_ctx->initial_pool_size = 5; - int ret = av_hwframe_ctx_init(p); - if (ret >= 0) { - return p; - } - av_buffer_unref(&p); - TORCH_CHECK( - false, "Failed to initialize CUDA frame context: ", av_err2string(ret)); -} - -void configure_codec_context( - AVCodecContext* codec_ctx, - const AVCodecParameters* params, - const torch::Device& device) { - int ret = avcodec_parameters_to_context(codec_ctx, params); - TORCH_CHECK( - ret >= 0, "Failed to set CodecContext parameter: ", av_err2string(ret)); - - if (device.type() == c10::DeviceType::CUDA) { -#ifndef USE_CUDA - TORCH_CHECK(false, "torchaudio is not compiled with CUDA support."); -#else - const AVCodecHWConfig* cfg = get_cuda_config(codec_ctx->codec); - // https://www.ffmpeg.org/doxygen/trunk/hw__decode_8c_source.html#l00221 - // 1. Set HW config to opaue pointer. - codec_ctx->opaque = static_cast(const_cast(cfg)); - // 2. Set pCodecContext->get_format call back function which - // will retrieve the HW pixel format from opaque pointer. 
- codec_ctx->get_format = get_hw_format; - codec_ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); - TORCH_INTERNAL_ASSERT( - codec_ctx->hw_device_ctx, "Failed to reference HW device context."); -#endif - } -} - -void open_codec( - AVCodecContext* codec_ctx, - const std::optional& decoder_option) { - AVDictionary* opts = get_option_dict(decoder_option); - - // Default to single thread execution. - if (!av_dict_get(opts, "threads", nullptr, 0)) { - av_dict_set(&opts, "threads", "1", 0); - } - - if (!codec_ctx->channel_layout) { - codec_ctx->channel_layout = - av_get_default_channel_layout(codec_ctx->channels); - } - - int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opts); - clean_up_dict(opts); - TORCH_CHECK( - ret >= 0, "Failed to initialize CodecContext: ", av_err2string(ret)); -} - -bool ends_with(std::string_view str, std::string_view suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -AVCodecContextPtr get_codec_ctx( - const AVCodecParameters* params, - const std::optional& decoder_name, - const std::optional& decoder_option, - const torch::Device& device) { - AVCodecContextPtr codec_ctx = - alloc_codec_context(params->codec_id, decoder_name); - configure_codec_context(codec_ctx, params, device); - open_codec(codec_ctx, decoder_option); - if (codec_ctx->hw_device_ctx) { - codec_ctx->hw_frames_ctx = get_hw_frames_ctx(codec_ctx); - } - if (ends_with(codec_ctx->codec->name, "_cuvid")) { - C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamingMediaDecoderCUDA"); - } - return codec_ctx; -} - -} // namespace - -using KeyType = StreamProcessor::KeyType; - -StreamProcessor::StreamProcessor(const AVRational& time_base) - : stream_time_base(time_base) {} - -//////////////////////////////////////////////////////////////////////////////// -// Configurations -//////////////////////////////////////////////////////////////////////////////// -KeyType StreamProcessor::add_stream( - int 
frames_per_chunk, - int num_chunks, - AVRational frame_rate, - const std::string& filter_description, - const torch::Device& device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - is_decoder_set(), "Decoder hasn't been set."); - // If device is provided, then check that codec_ctx has hw_device_ctx set. - // In case, defining an output stream with HW accel on an input stream that - // has decoder set without HW accel, it will cause seg fault. - // i.e. - // The following should be rejected here. - // reader = StreamingMediaDecoder(...) - // reader.add_video_stream(..., decoder="h264_cuvid") - // reader.add_video_stream(..., decoder="h264_cuvid", hw_accel="cuda") - // TODO: - // One idea to work around this is to always define HW device context, and - // if HW acceleration is not required, insert `hwdownload` filter. - // This way it will be possible to handle both cases at the same time. - switch (device.type()) { - case torch::kCPU: - TORCH_CHECK( - !codec_ctx->hw_device_ctx, - "Decoding without Hardware acceleration is requested, however, " - "the decoder has been already defined with a HW acceleration. " - "Decoding a stream with and without HW acceleration simultaneously " - "is not supported."); - break; - case torch::kCUDA: - TORCH_CHECK( - codec_ctx->hw_device_ctx, - "CUDA Hardware acceleration is requested, however, the decoder has " - "been already defined without a HW acceleration. 
" - "Decoding a stream with and without HW acceleration simultaneously " - "is not supported."); - break; - default:; - } - - switch (codec_ctx->codec_type) { - case AVMEDIA_TYPE_AUDIO: - post_processes.emplace( - std::piecewise_construct, - std::forward_as_tuple(current_key), - std::forward_as_tuple(get_audio_process( - stream_time_base, - codec_ctx, - filter_description, - frames_per_chunk, - num_chunks))); - return current_key++; - case AVMEDIA_TYPE_VIDEO: - post_processes.emplace( - std::piecewise_construct, - std::forward_as_tuple(current_key), - std::forward_as_tuple(get_video_process( - stream_time_base, - frame_rate, - codec_ctx, - filter_description, - frames_per_chunk, - num_chunks, - device))); - return current_key++; - default: - TORCH_CHECK(false, "Only Audio and Video are supported"); - } -} - -void StreamProcessor::remove_stream(KeyType key) { - post_processes.erase(key); -} - -void StreamProcessor::set_discard_timestamp(int64_t timestamp) { - TORCH_CHECK(timestamp >= 0, "timestamp must be non-negative."); - discard_before_pts = - av_rescale_q(timestamp, av_get_time_base_q(), stream_time_base); -} - -void StreamProcessor::set_decoder( - const AVCodecParameters* codecpar, - const std::optional& decoder_name, - const std::optional& decoder_option, - const torch::Device& device) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!codec_ctx, "Decoder has already been set."); - codec_ctx = get_codec_ctx(codecpar, decoder_name, decoder_option, device); -} - -//////////////////////////////////////////////////////////////////////////////// -// Query methods -//////////////////////////////////////////////////////////////////////////////// -std::string StreamProcessor::get_filter_description(KeyType key) const { - return post_processes.at(key)->get_filter_desc(); -} - -FilterGraphOutputInfo StreamProcessor::get_filter_output_info( - KeyType key) const { - return post_processes.at(key)->get_filter_output_info(); -} - -bool StreamProcessor::is_buffer_ready() const { - for 
(const auto& it : post_processes) { - if (!it.second->is_buffer_ready()) { - return false; - } - } - return true; -} - -bool StreamProcessor::is_decoder_set() const { - return codec_ctx; -} - -//////////////////////////////////////////////////////////////////////////////// -// The streaming process -//////////////////////////////////////////////////////////////////////////////// -// 0: some kind of success -// <0: Some error happened -int StreamProcessor::process_packet(AVPacket* packet) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - is_decoder_set(), - "Decoder must have been set prior to calling this function."); - int ret = avcodec_send_packet(codec_ctx, packet); - while (ret >= 0) { - ret = avcodec_receive_frame(codec_ctx, frame); - // AVERROR(EAGAIN) means that new input data is required to return new - // output. - if (ret == AVERROR(EAGAIN)) { - return 0; - } - if (ret == AVERROR_EOF) { - return send_frame(nullptr); - } - if (ret < 0) { - return ret; - } - - // If pts is undefined then overwrite with best effort estimate. - // In this case, best_effort_timestamp is basically the number of frames - // emit from decoder. - // - // We need valid pts because filter_graph does not fall back to - // best_effort_timestamp. - if (frame->pts == AV_NOPTS_VALUE) { - if (frame->best_effort_timestamp == AV_NOPTS_VALUE) { - // This happens in drain mode. - // When the decoder enters drain mode, it starts flushing the internally - // buffered frames, of which PTS cannot be estimated. - // - // This is because they might be intra-frames not in chronological - // order. In this case, we use received frames as-is in the order they - // are received. - frame->pts = codec_ctx->frame_number + 1; - } else { - frame->pts = frame->best_effort_timestamp; - } - } - - // When the value of discard_before_pts is 0, we consider that the seek is - // not performed and all the frames are passed to downstream - // unconditionally. - // - // Two reasons for this behavior; - // 1. 
When seek mode is not precise, we do not discard any frame. - // In this case discard_before_pts is set to zero. - // 2. When users seek to zero, what they expect is to get to the beginning - // of the data. - // - // Note: discard_before_pts < 0 is UB. - if (discard_before_pts <= 0 || frame->pts >= discard_before_pts) { - send_frame(frame); - } - - // else we can just unref the frame and continue - av_frame_unref(frame); - } - return ret; -} - -void StreamProcessor::flush() { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - is_decoder_set(), - "Decoder must have been set prior to calling this function."); - avcodec_flush_buffers(codec_ctx); - for (auto& ite : post_processes) { - ite.second->flush(); - } -} - -// 0: some kind of success -// <0: Some error happened -int StreamProcessor::send_frame(AVFrame* frame_) { - int ret = 0; - for (auto& ite : post_processes) { - int ret2 = ite.second->process_frame(frame_); - if (ret2 < 0) { - ret = ret2; - } - } - return ret; -} - -//////////////////////////////////////////////////////////////////////////////// -// Retrieval -//////////////////////////////////////////////////////////////////////////////// -std::optional StreamProcessor::pop_chunk(KeyType key) { - return post_processes.at(key)->pop_chunk(); -} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/stream_processor.h b/src/libtorio/ffmpeg/stream_reader/stream_processor.h deleted file mode 100644 index 267c1159d4..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/stream_processor.h +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace torio { -namespace io { - -class StreamProcessor { - public: - using KeyType = int; - - private: - // Stream time base which is not stored in AVCodecContextPtr - AVRational stream_time_base; - - // Components for decoding source media - AVCodecContextPtr codec_ctx{nullptr}; - AVFramePtr frame{alloc_avframe()}; - - KeyType current_key = 0; - std::map> 
post_processes; - - // Used for precise seek. - // 0: no discard - // Positive Values: decoded frames with PTS values less than this are - // discarded. - // Negative values: UB. Should not happen. - int64_t discard_before_pts = 0; - - public: - explicit StreamProcessor(const AVRational& time_base); - ~StreamProcessor() = default; - // Non-copyable - StreamProcessor(const StreamProcessor&) = delete; - StreamProcessor& operator=(const StreamProcessor&) = delete; - // Movable - StreamProcessor(StreamProcessor&&) = default; - StreamProcessor& operator=(StreamProcessor&&) = default; - - ////////////////////////////////////////////////////////////////////////////// - // Configurations - ////////////////////////////////////////////////////////////////////////////// - // 1. Initialize decoder (if not initialized yet) - // 2. Configure a new audio/video filter. - // If the custom parameter is provided, then perform resize, resample etc.. - // otherwise, the filter only converts the sample type. - // 3. Configure a buffer. - // 4. Return filter ID. - KeyType add_stream( - int frames_per_chunk, - int num_chunks, - AVRational frame_rate, - const std::string& filter_description, - const torch::Device& device); - - // 1. Remove the stream - void remove_stream(KeyType key); - - // Set discard - // The input timestamp must be expressed in AV_TIME_BASE unit. 
- void set_discard_timestamp(int64_t timestamp); - - void set_decoder( - const AVCodecParameters* codecpar, - const std::optional& decoder_name, - const std::optional& decoder_option, - const torch::Device& device); - - ////////////////////////////////////////////////////////////////////////////// - // Query methods - ////////////////////////////////////////////////////////////////////////////// - [[nodiscard]] std::string get_filter_description(KeyType key) const; - [[nodiscard]] FilterGraphOutputInfo get_filter_output_info(KeyType key) const; - - bool is_buffer_ready() const; - [[nodiscard]] bool is_decoder_set() const; - - ////////////////////////////////////////////////////////////////////////////// - // The streaming process - ////////////////////////////////////////////////////////////////////////////// - // 1. decode the input frame - // 2. pass the decoded data to filters - // 3. each filter store the result to the corresponding buffer - // - Sending NULL will drain (flush) the internal - int process_packet(AVPacket* packet); - - // flush the internal buffer of decoder. 
- // To be use when seeking - void flush(); - - private: - int send_frame(AVFrame* pFrame); - - ////////////////////////////////////////////////////////////////////////////// - // Retrieval - ////////////////////////////////////////////////////////////////////////////// - public: - // Get the chunk from the given filter result - std::optional pop_chunk(KeyType key); -}; - -} // namespace io -} // namespace torio diff --git a/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp b/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp deleted file mode 100644 index 39fd7cee0b..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp +++ /dev/null @@ -1,612 +0,0 @@ -#include -#include -#include -#include -#include - -namespace torio::io { - -using KeyType = StreamProcessor::KeyType; - -////////////////////////////////////////////////////////////////////////////// -// Initialization / resource allocations -////////////////////////////////////////////////////////////////////////////// -namespace { -AVFormatContext* get_input_format_context( - const std::string& src, - const std::optional& format, - const std::optional& option, - AVIOContext* io_ctx) { - AVFormatContext* p = avformat_alloc_context(); - TORCH_CHECK(p, "Failed to allocate AVFormatContext."); - if (io_ctx) { - p->pb = io_ctx; - } - - auto* pInputFormat = [&format]() -> AVFORMAT_CONST AVInputFormat* { - if (format.has_value()) { - std::string format_str = format.value(); - AVFORMAT_CONST AVInputFormat* pInput = - av_find_input_format(format_str.c_str()); - TORCH_CHECK(pInput, "Unsupported device/format: \"", format_str, "\""); - return pInput; - } - return nullptr; - }(); - - AVDictionary* opt = get_option_dict(option); - int ret = avformat_open_input(&p, src.c_str(), pInputFormat, &opt); - clean_up_dict(opt); - - TORCH_CHECK( - ret >= 0, - "Failed to open the input \"", - src, - "\" (", - av_err2string(ret), - ")."); - return p; -} -} // namespace - 
-StreamingMediaDecoder::StreamingMediaDecoder(AVFormatContext* p) - : format_ctx(p) { - C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamingMediaDecoder"); - int ret = avformat_find_stream_info(format_ctx, nullptr); - TORCH_CHECK( - ret >= 0, "Failed to find stream information: ", av_err2string(ret)); - - processors = - std::vector>(format_ctx->nb_streams); - for (int i = 0; i < format_ctx->nb_streams; ++i) { - switch (format_ctx->streams[i]->codecpar->codec_type) { - case AVMEDIA_TYPE_AUDIO: - case AVMEDIA_TYPE_VIDEO: - break; - default: - format_ctx->streams[i]->discard = AVDISCARD_ALL; - } - } -} - -StreamingMediaDecoder::StreamingMediaDecoder( - AVIOContext* io_ctx, - const std::optional& format, - const std::optional& option) - : StreamingMediaDecoder(get_input_format_context( - "Custom Input Context", - format, - option, - io_ctx)) {} - -StreamingMediaDecoder::StreamingMediaDecoder( - const std::string& src, - const std::optional& format, - const std::optional& option) - : StreamingMediaDecoder( - get_input_format_context(src, format, option, nullptr)) {} - -////////////////////////////////////////////////////////////////////////////// -// Helper methods -////////////////////////////////////////////////////////////////////////////// -void validate_open_stream(AVFormatContext* format_ctx) { - TORCH_CHECK(format_ctx, "Stream is not open."); -} - -void validate_src_stream_index(AVFormatContext* format_ctx, int i) { - validate_open_stream(format_ctx); - TORCH_CHECK( - i >= 0 && i < static_cast(format_ctx->nb_streams), - "Source stream index out of range"); -} - -void validate_src_stream_type( - AVFormatContext* format_ctx, - int i, - AVMediaType type) { - validate_src_stream_index(format_ctx, i); - TORCH_CHECK( - format_ctx->streams[i]->codecpar->codec_type == type, - "Stream ", - i, - " is not ", - av_get_media_type_string(type), - " stream."); -} - -//////////////////////////////////////////////////////////////////////////////// -// Query methods 
-//////////////////////////////////////////////////////////////////////////////// -int64_t StreamingMediaDecoder::num_src_streams() const { - return format_ctx->nb_streams; -} - -namespace { -OptionDict parse_metadata(const AVDictionary* metadata) { - AVDictionaryEntry* tag = nullptr; - OptionDict ret; - while ((tag = av_dict_get(metadata, "", tag, AV_DICT_IGNORE_SUFFIX))) { - ret.emplace(std::string(tag->key), std::string(tag->value)); - } - return ret; -} -} // namespace - -OptionDict StreamingMediaDecoder::get_metadata() const { - return parse_metadata(format_ctx->metadata); -} - -SrcStreamInfo StreamingMediaDecoder::get_src_stream_info(int i) const { - validate_src_stream_index(format_ctx, i); - - AVStream* stream = format_ctx->streams[i]; - AVCodecParameters* codecpar = stream->codecpar; - - SrcStreamInfo ret; - ret.media_type = codecpar->codec_type; - ret.bit_rate = codecpar->bit_rate; - ret.num_frames = stream->nb_frames; - ret.bits_per_sample = codecpar->bits_per_raw_sample; - ret.metadata = parse_metadata(stream->metadata); - const AVCodecDescriptor* desc = avcodec_descriptor_get(codecpar->codec_id); - if (desc) { - ret.codec_name = desc->name; - ret.codec_long_name = desc->long_name; - } - - switch (codecpar->codec_type) { - case AVMEDIA_TYPE_AUDIO: { - AVSampleFormat smp_fmt = static_cast(codecpar->format); - if (smp_fmt != AV_SAMPLE_FMT_NONE) { - ret.fmt_name = av_get_sample_fmt_name(smp_fmt); - } - ret.sample_rate = static_cast(codecpar->sample_rate); - ret.num_channels = codecpar->channels; - break; - } - case AVMEDIA_TYPE_VIDEO: { - AVPixelFormat pix_fmt = static_cast(codecpar->format); - if (pix_fmt != AV_PIX_FMT_NONE) { - ret.fmt_name = av_get_pix_fmt_name(pix_fmt); - } - ret.width = codecpar->width; - ret.height = codecpar->height; - ret.frame_rate = av_q2d(stream->r_frame_rate); - break; - } - default:; - } - return ret; -} - -namespace { -AVCodecParameters* get_codecpar() { - AVCodecParameters* ptr = avcodec_parameters_alloc(); - 
TORCH_CHECK(ptr, "Failed to allocate resource."); - return ptr; -} -} // namespace - -StreamParams StreamingMediaDecoder::get_src_stream_params(int i) { - validate_src_stream_index(format_ctx, i); - AVStream* stream = format_ctx->streams[i]; - - AVCodecParametersPtr codec_params(get_codecpar()); - int ret = avcodec_parameters_copy(codec_params, stream->codecpar); - TORCH_CHECK( - ret >= 0, - "Failed to copy the stream's codec parameters. (", - av_err2string(ret), - ")"); - return {std::move(codec_params), stream->time_base, i}; -} - -int64_t StreamingMediaDecoder::num_out_streams() const { - return static_cast(stream_indices.size()); -} - -OutputStreamInfo StreamingMediaDecoder::get_out_stream_info(int i) const { - TORCH_CHECK( - i >= 0 && static_cast(i) < stream_indices.size(), - "Output stream index out of range"); - int i_src = stream_indices[i].first; - KeyType key = stream_indices[i].second; - FilterGraphOutputInfo info = processors[i_src]->get_filter_output_info(key); - - OutputStreamInfo ret; - ret.source_index = i_src; - ret.filter_description = processors[i_src]->get_filter_description(key); - ret.media_type = info.type; - ret.format = info.format; - switch (info.type) { - case AVMEDIA_TYPE_AUDIO: - ret.sample_rate = info.sample_rate; - ret.num_channels = info.num_channels; - break; - case AVMEDIA_TYPE_VIDEO: - ret.width = info.width; - ret.height = info.height; - ret.frame_rate = info.frame_rate; - break; - default:; - } - return ret; -} - -int64_t StreamingMediaDecoder::find_best_audio_stream() const { - return av_find_best_stream( - format_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0); -} - -int64_t StreamingMediaDecoder::find_best_video_stream() const { - return av_find_best_stream( - format_ctx, AVMEDIA_TYPE_VIDEO, -1, -1, nullptr, 0); -} - -bool StreamingMediaDecoder::is_buffer_ready() const { - if (processors.empty()) { - // If no decoding output streams exist, then determine overall readiness - // from the readiness of packet buffer. 
- return packet_buffer->has_packets(); - } else { - // Otherwise, determine readiness solely from the readiness of the decoding - // output streams. - for (const auto& it : processors) { - if (it && !it->is_buffer_ready()) { - return false; - } - } - } - return true; -} - -//////////////////////////////////////////////////////////////////////////////// -// Configure methods -//////////////////////////////////////////////////////////////////////////////// -void StreamingMediaDecoder::seek(double timestamp_s, int64_t mode) { - TORCH_CHECK(timestamp_s >= 0, "timestamp must be non-negative."); - TORCH_CHECK( - format_ctx->nb_streams > 0, - "At least one stream must exist in this context"); - - int64_t timestamp_av_tb = static_cast(timestamp_s * AV_TIME_BASE); - - int flag = AVSEEK_FLAG_BACKWARD; - switch (mode) { - case 0: - // reset seek_timestap as it is only used for precise seek - seek_timestamp = 0; - break; - case 1: - flag |= AVSEEK_FLAG_ANY; - // reset seek_timestap as it is only used for precise seek - seek_timestamp = 0; - break; - case 2: - seek_timestamp = timestamp_av_tb; - break; - default: - TORCH_CHECK(false, "Invalid mode value: ", mode); - } - - int ret = av_seek_frame(format_ctx, -1, timestamp_av_tb, flag); - - if (ret < 0) { - seek_timestamp = 0; - TORCH_CHECK(false, "Failed to seek. 
(" + av_err2string(ret) + ".)"); - } - for (const auto& it : processors) { - if (it) { - it->flush(); - it->set_discard_timestamp(seek_timestamp); - } - } -} - -void StreamingMediaDecoder::add_audio_stream( - int64_t i, - int64_t frames_per_chunk, - int64_t num_chunks, - const std::optional& filter_desc, - const std::optional& decoder, - const std::optional& decoder_option) { - add_stream( - static_cast(i), - AVMEDIA_TYPE_AUDIO, - static_cast(frames_per_chunk), - static_cast(num_chunks), - filter_desc.value_or("anull"), - decoder, - decoder_option, - torch::Device(torch::DeviceType::CPU)); -} - -void StreamingMediaDecoder::add_video_stream( - int64_t i, - int64_t frames_per_chunk, - int64_t num_chunks, - const std::optional& filter_desc, - const std::optional& decoder, - const std::optional& decoder_option, - const std::optional& hw_accel) { - const torch::Device device = [&]() { - if (!hw_accel) { - return torch::Device{c10::DeviceType::CPU}; - } -#ifdef USE_CUDA - torch::Device d{hw_accel.value()}; - TORCH_CHECK( - d.is_cuda(), "Only CUDA is supported for HW acceleration. Found: ", d); - return d; -#else - TORCH_CHECK( - false, - "torchaudio is not compiled with CUDA support. 
Hardware acceleration is not available."); -#endif - }(); - - add_stream( - static_cast(i), - AVMEDIA_TYPE_VIDEO, - static_cast(frames_per_chunk), - static_cast(num_chunks), - filter_desc.value_or("null"), - decoder, - decoder_option, - device); -} - -void StreamingMediaDecoder::add_packet_stream(int i) { - validate_src_stream_index(format_ctx, i); - if (!packet_buffer) { - packet_buffer = std::make_unique(); - } - packet_stream_indices.emplace(i); -} - -void StreamingMediaDecoder::add_stream( - int i, - AVMediaType media_type, - int frames_per_chunk, - int num_chunks, - const std::string& filter_desc, - const std::optional& decoder, - const std::optional& decoder_option, - const torch::Device& device) { - validate_src_stream_type(format_ctx, i, media_type); - - AVStream* stream = format_ctx->streams[i]; - // When media source is file-like object, it is possible that source codec - // is not detected properly. - TORCH_CHECK( - stream->codecpar->format != -1, - "Failed to detect the source stream format."); - - if (!processors[i]) { - processors[i] = std::make_unique(stream->time_base); - processors[i]->set_discard_timestamp(seek_timestamp); - } - if (!processors[i]->is_decoder_set()) { - processors[i]->set_decoder( - stream->codecpar, decoder, decoder_option, device); - } else { - TORCH_CHECK( - !decoder && (!decoder_option || decoder_option.value().size() == 0), - "Decoder options were provided, but the decoder has already been initialized.") - } - - stream->discard = AVDISCARD_DEFAULT; - - auto frame_rate = [&]() -> AVRational { - switch (media_type) { - case AVMEDIA_TYPE_AUDIO: - return AVRational{0, 1}; - case AVMEDIA_TYPE_VIDEO: - return av_guess_frame_rate(format_ctx, stream, nullptr); - default: - TORCH_INTERNAL_ASSERT( - false, - "Unexpected media type is given: ", - av_get_media_type_string(media_type)); - } - }(); - int key = processors[i]->add_stream( - frames_per_chunk, num_chunks, frame_rate, filter_desc, device); - 
stream_indices.push_back(std::make_pair<>(i, key)); -} - -void StreamingMediaDecoder::remove_stream(int64_t i) { - TORCH_CHECK( - i >= 0 && static_cast(i) < stream_indices.size(), - "Output stream index out of range"); - auto it = stream_indices.begin() + i; - int iP = it->first; - processors[iP]->remove_stream(it->second); - stream_indices.erase(it); - - // Check if the processor is still refered and if not, disable the processor - bool still_used = false; - for (auto& p : stream_indices) { - still_used |= (iP == p.first); - if (still_used) { - break; - } - } - if (!still_used) { - processors[iP].reset(nullptr); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Stream methods -//////////////////////////////////////////////////////////////////////////////// -// Note -// return value (to be finalized) -// 0: caller should keep calling this function -// 1: It's done, caller should stop calling -// <0: Some error happened -int StreamingMediaDecoder::process_packet() { - int ret = av_read_frame(format_ctx, packet); - if (ret == AVERROR_EOF) { - ret = drain(); - return (ret < 0) ? ret : 1; - } - if (ret < 0) { - return ret; - } - AutoPacketUnref auto_unref{packet}; - - int stream_index = packet->stream_index; - - if (packet_stream_indices.count(stream_index)) { - packet_buffer->push_packet(packet); - } - - auto& processor = processors[stream_index]; - if (!processor) { - return 0; - } - - ret = processor->process_packet(packet); - - return (ret < 0) ? 
ret : 0; -} - -// Similar to `process_packet()`, but in case process_packet returns EAGAIN, -// it keeps retrying until timeout happens, -// -// timeout and backoff is given in millisecond -int StreamingMediaDecoder::process_packet_block( - double timeout, - double backoff) { - auto dead_line = [&]() { - // If timeout < 0, then it repeats forever - if (timeout < 0) { - return std::chrono::time_point::max(); - } - auto timeout_ = static_cast(1000 * timeout); - return std::chrono::steady_clock::now() + - std::chrono::microseconds{timeout_}; - }(); - - std::chrono::microseconds sleep{static_cast(1000 * backoff)}; - - while (true) { - int ret = process_packet(); - if (ret != AVERROR(EAGAIN)) { - return ret; - } - if (dead_line < std::chrono::steady_clock::now()) { - return ret; - } - // FYI: ffmpeg sleeps 10 milli seconds if the read happens in a separate - // thread - // https://github.com/FFmpeg/FFmpeg/blob/b0f8dbb0cacc45a19f18c043afc706d7d26bef74/fftools/ffmpeg.c#L3952 - // https://github.com/FFmpeg/FFmpeg/blob/b0f8dbb0cacc45a19f18c043afc706d7d26bef74/fftools/ffmpeg.c#L4542 - // - std::this_thread::sleep_for(sleep); - } -} - -void StreamingMediaDecoder::process_all_packets() { - int64_t ret = 0; - do { - ret = process_packet(); - } while (!ret); -} - -int StreamingMediaDecoder::process_packet( - const std::optional& timeout, - const double backoff) { - int code = [&]() -> int { - if (timeout.has_value()) { - return process_packet_block(timeout.value(), backoff); - } - return process_packet(); - }(); - TORCH_CHECK( - code >= 0, "Failed to process a packet. (" + av_err2string(code) + "). "); - return code; -} - -int StreamingMediaDecoder::fill_buffer( - const std::optional& timeout, - const double backoff) { - while (!is_buffer_ready()) { - int code = process_packet(timeout, backoff); - if (code != 0) { - return code; - } - } - return 0; -} - -// <0: Some error happened. 
-int StreamingMediaDecoder::drain() { - int ret = 0, tmp = 0; - for (auto& p : processors) { - if (p) { - tmp = p->process_packet(nullptr); - if (tmp < 0) { - ret = tmp; - } - } - } - return ret; -} - -std::vector> StreamingMediaDecoder::pop_chunks() { - std::vector> ret; - ret.reserve(static_cast(num_out_streams())); - for (auto& i : stream_indices) { - ret.emplace_back(processors[i.first]->pop_chunk(i.second)); - } - return ret; -} - -std::vector StreamingMediaDecoder::pop_packets() { - return packet_buffer->pop_packets(); -} - -////////////////////////////////////////////////////////////////////////////// -// StreamingMediaDecoderCustomIO -////////////////////////////////////////////////////////////////////////////// - -namespace detail { -namespace { -AVIOContext* get_io_context( - void* opaque, - int buffer_size, - int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = static_cast(av_malloc(buffer_size)); - TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = avio_alloc_context( - buffer, buffer_size, 0, opaque, read_packet, nullptr, seek); - if (!io_ctx) { - av_freep(&buffer); - TORCH_CHECK(false, "Failed to allocate AVIOContext."); - } - return io_ctx; -} -} // namespace - -CustomInput::CustomInput( - void* opaque, - int buffer_size, - int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)) - : io_ctx(get_io_context(opaque, buffer_size, read_packet, seek)) {} -} // namespace detail - -StreamingMediaDecoderCustomIO::StreamingMediaDecoderCustomIO( - void* opaque, - const std::optional& format, - int buffer_size, - int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence), - const std::optional& option) - : CustomInput(opaque, buffer_size, read_packet, seek), - StreamingMediaDecoder(io_ctx, format, option) {} - -} // 
namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_reader/stream_reader.h b/src/libtorio/ffmpeg/stream_reader/stream_reader.h deleted file mode 100644 index a8e1d9f065..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/stream_reader.h +++ /dev/null @@ -1,399 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -namespace torio { -namespace io { - -////////////////////////////////////////////////////////////////////////////// -// StreamingMediaDecoder -////////////////////////////////////////////////////////////////////////////// - -/// -/// Fetch and decode audio/video streams chunk by chunk. -/// -class StreamingMediaDecoder { - AVFormatInputContextPtr format_ctx; - AVPacketPtr packet{alloc_avpacket()}; - - std::vector> processors; - // Mapping from user-facing stream index to internal index. - // The first one is processor index, - // the second is the map key inside of processor. - std::vector> stream_indices; - - // For supporting reading raw packets. - std::unique_ptr packet_buffer; - // Set of source stream indices to read packets for. - std::unordered_set packet_stream_indices; - - // timestamp to seek to expressed in AV_TIME_BASE - // - // 0 : No seek - // Positive value: Skip AVFrames with timestamps before it - // Negative value: UB. Should not happen - // - // Note: - // When precise seek is performed, this value is set to the value provided - // by client code, and PTS values of decoded frames are compared against it - // to determine whether the frames should be passed to downstream. - int64_t seek_timestamp = 0; - - /// @name Constructors - /// - ///@{ - - /// @cond - - private: - /// Construct StreamingMediaDecoder from already initialized AVFormatContext. - /// This is a low level constructor interact with FFmpeg directly. - /// One can provide custom AVFormatContext in case the other constructor - /// does not meet a requirement. - /// @param format_ctx An initialized AVFormatContext. 
StreamingMediaDecoder - /// will own the resources and release it at the end. - explicit StreamingMediaDecoder(AVFormatContext* format_ctx); - - protected: - /// Concstruct media processor from custom IO. - /// - /// @param io_ctx Custom IO Context. - /// @param format Specifies format, such as mp4. - /// @param option Custom option passed when initializing format context - /// (opening source). - explicit StreamingMediaDecoder( - AVIOContext* io_ctx, - const std::optional& format = std::nullopt, - const std::optional& option = std::nullopt); - - /// @endcond - - public: - /// Construct media processor from soruce URI. - /// - /// @param src URL of source media, in the format FFmpeg can understand. - /// @param format Specifies format (such as mp4) or device (such as lavfi and - /// avfoundation) - /// @param option Custom option passed when initializing format context - /// (opening source). - explicit StreamingMediaDecoder( - const std::string& src, - const std::optional& format = std::nullopt, - const std::optional& option = std::nullopt); - - ///@} - - /// @cond - - ~StreamingMediaDecoder() = default; - // Non-copyable - StreamingMediaDecoder(const StreamingMediaDecoder&) = delete; - StreamingMediaDecoder& operator=(const StreamingMediaDecoder&) = delete; - // Movable - StreamingMediaDecoder(StreamingMediaDecoder&&) = default; - StreamingMediaDecoder& operator=(StreamingMediaDecoder&&) = default; - - /// @endcond - - ////////////////////////////////////////////////////////////////////////////// - // Query methods - ////////////////////////////////////////////////////////////////////////////// - public: - /// @name Query methods - ///@{ - - /// Find a suitable audio stream using heuristics from ffmpeg. - /// - /// If successful, the index of the best stream (>=0) is returned. - /// Otherwise a negative value is returned. - int64_t find_best_audio_stream() const; - /// Find a suitable video stream using heuristics from ffmpeg. 
- /// - /// If successful, the index of the best stream (0>=) is returned. - /// otherwise a negative value is returned. - int64_t find_best_video_stream() const; - /// Fetch metadata of the source media. - OptionDict get_metadata() const; - /// Fetch the number of source streams found in the input media. - /// - /// The source streams include not only audio/video streams but also - /// subtitle and others. - int64_t num_src_streams() const; - /// Fetch information about the specified source stream. - /// - /// The valid value range is ``[0, num_src_streams())``. - SrcStreamInfo get_src_stream_info(int i) const; - /// Fetch the number of output streams defined by client code. - int64_t num_out_streams() const; - /// Fetch information about the specified output stream. - /// - /// The valid value range is ``[0, num_out_streams())``. - OutputStreamInfo get_out_stream_info(int i) const; - /// Check if all the buffers of the output streams have enough decoded frames. - bool is_buffer_ready() const; - - /// @cond - /// Get source stream parameters. Necessary on the write side for packet - /// passthrough. - /// - /// @param i Source stream index. - StreamParams get_src_stream_params(int i); - /// @endcond - - ///@} - - ////////////////////////////////////////////////////////////////////////////// - // Configure methods - ////////////////////////////////////////////////////////////////////////////// - /// @name Configure methods - ///@{ - - /// Define an output audio stream. - /// - /// @param i The index of the source stream. - /// - /// @param frames_per_chunk Number of frames returned as one chunk. - /// @parblock - /// If a source stream is exhausted before ``frames_per_chunk`` frames - /// are buffered, the chunk is returned as-is. Thus the number of frames - /// in the chunk may be smaller than ````frames_per_chunk``. - /// - /// Providing ``-1`` disables chunking, in which case, method - /// ``pop_chunks()`` returns all the buffered frames as one chunk. 
- /// @endparblock - /// - /// @param num_chunks Internal buffer size. - /// @parblock - /// When the number of buffered chunks exceeds this number, old chunks are - /// dropped. For example, if `frames_per_chunk` is 5 and `buffer_chunk_size` - /// is 3, then frames older than 15 are dropped. - /// - /// Providing ``-1`` disables this behavior, forcing the retention of all - /// chunks. - /// @endparblock - /// - /// @param filter_desc Description of filter graph applied to the source - /// stream. - /// - /// @param decoder The name of the decoder to be used. - /// When provided, use the specified decoder instead of the default one. - /// - /// @param decoder_option Options passed to decoder. - /// @parblock - /// To list decoder options for a decoder, you can use - /// `ffmpeg -h decoder=` command. - /// - /// In addition to decoder-specific options, you can also pass options - /// related to multithreading. They are effective only if the decoder - /// supports them. If neither of them are provided, StreamingMediaDecoder - /// defaults to single thread. - /// - ``"threads"``: The number of threads or the value ``"0"`` - /// to let FFmpeg decide based on its heuristics. - /// - ``"thread_type"``: Which multithreading method to use. - /// The valid values are ``"frame"`` or ``"slice"``. - /// Note that each decoder supports a different set of methods. - /// If not provided, a default value is used. - /// - ``"frame"``: Decode more than one frame at once. - /// Each thread handles one frame. - /// This will increase decoding delay by one frame per thread - /// - ``"slice"``: Decode more than one part of a single frame at once. - /// @endparblock - void add_audio_stream( - int64_t i, - int64_t frames_per_chunk, - int64_t num_chunks, - const std::optional& filter_desc = std::nullopt, - const std::optional& decoder = std::nullopt, - const std::optional& decoder_option = std::nullopt); - /// Define an output video stream. 
- /// - /// @param i,frames_per_chunk,num_chunks,filter_desc,decoder,decoder_option - /// See `add_audio_stream()`. - /// - /// @param hw_accel Enable hardware acceleration. - /// @parblock - /// When video is decoded on CUDA hardware, (for example by specifying - /// `"h264_cuvid"` decoder), passing CUDA device indicator to ``hw_accel`` - /// (i.e. ``hw_accel="cuda:0"``) will make StreamingMediaDecoder place the - /// resulting frames directly on the specified CUDA device as a CUDA tensor. - /// - /// If `None`, the chunk will be moved to CPU memory. - /// @endparblock - void add_video_stream( - int64_t i, - int64_t frames_per_chunk, - int64_t num_chunks, - const std::optional& filter_desc = std::nullopt, - const std::optional& decoder = std::nullopt, - const std::optional& decoder_option = std::nullopt, - const std::optional& hw_accel = std::nullopt); - - /// @cond - /// Add a output packet stream. - /// Allows for passing packets directly from the source stream, bypassing - /// the decode path, to ``StreamingMediaEncoder`` for remuxing. - /// - /// @param i The index of the source stream. - void add_packet_stream(int i); - /// @endcond - - /// Remove an output stream. - /// - /// @param i The index of the output stream to be removed. - /// The valid value range is `[0, num_out_streams())`. - void remove_stream(int64_t i); - - ///@} - - private: - void add_stream( - int i, - AVMediaType media_type, - int frames_per_chunk, - int num_chunks, - const std::string& filter_desc, - const std::optional& decoder, - const std::optional& decoder_option, - const torch::Device& device); - - ////////////////////////////////////////////////////////////////////////////// - // Stream methods - ////////////////////////////////////////////////////////////////////////////// - public: - /// @name Stream methods - ///@{ - - /// Seek into the given time stamp. - /// - /// @param timestamp Target time stamp in second. - /// @param mode Seek mode. - /// - ``0``: Keyframe mode. 
Seek into nearest key frame before the given - /// timestamp. - /// - ``1``: Any mode. Seek into any frame (including non-key frames) before - /// the given timestamp. - /// - ``2``: Precise mode. First seek into the nearest key frame before the - /// given timestamp, then decode frames until it reaches the frame closest - /// to the given timestamp. - void seek(double timestamp, int64_t mode); - - /// Demultiplex and process one packet. - /// - /// @return - /// - ``0``: A packet was processed successfully and there are still - /// packets left in the stream, so client code can call this method again. - /// - ``1``: A packet was processed successfully and it reached EOF. - /// Client code should not call this method again. - /// - ``<0``: An error has happened. - int process_packet(); - /// Similar to `process_packet()`, but in case it fails due to resource - /// temporarily being unavailable, it automatically retries. - /// - /// This behavior is helpful when using device input, such as a microphone, - /// during which the buffer may be busy while sample acquisition is happening. - /// - /// @param timeout Timeout in milli seconds. - /// - ``>=0``: Keep retrying until the given time passes. - /// - ``<0``: Keep retrying forever. - /// @param backoff Time to wait before retrying in milli seconds. - int process_packet_block(const double timeout, const double backoff); - - /// @cond - // High-level method used by Python bindings. 
- int process_packet( - const std::optional& timeout, - const double backoff); - /// @endcond - - /// Process packets unitl EOF - void process_all_packets(); - - /// Process packets until all the chunk buffers have at least one chunk - /// - /// @param timeout See `process_packet_block()` - /// @param backoff See `process_packet_block()` - int fill_buffer( - const std::optional& timeout = std::nullopt, - const double backoff = 10.); - - ///@} - - private: - int drain(); - - ////////////////////////////////////////////////////////////////////////////// - // Retrieval - ////////////////////////////////////////////////////////////////////////////// - public: - /// @name Retrieval methods - ///@{ - - /// Pop one chunk from each output stream if it is available. - std::vector> pop_chunks(); - - /// @cond - /// Pop packets from buffer, if available. - std::vector pop_packets(); - /// @endcond - ///@} -}; - -////////////////////////////////////////////////////////////////////////////// -// StreamingMediaDecoderCustomIO -////////////////////////////////////////////////////////////////////////////// - -/// @cond - -namespace detail { -struct CustomInput { - AVIOContextPtr io_ctx; - CustomInput( - void* opaque, - int buffer_size, - int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)); -}; -} // namespace detail - -/// @endcond - -/// -/// A subclass of StreamingMediaDecoder which works with custom read function. -/// Can be used for decoding media from memory or custom object. -/// -class StreamingMediaDecoderCustomIO : private detail::CustomInput, - public StreamingMediaDecoder { - public: - /// - /// Construct StreamingMediaDecoder with custom read and seek functions. - /// - /// @param opaque Custom data used by ``read_packet`` and ``seek`` functions. - /// @param format Specify input format. 
- /// @param buffer_size The size of the intermediate buffer, which FFmpeg uses - /// to pass data to function read_packet. - /// @param read_packet Custom read function that is called from FFmpeg to - /// read data from the destination. - /// @param seek Optional seek function that is used to seek the destination. - /// @param option Custom option passed when initializing format context. - StreamingMediaDecoderCustomIO( - void* opaque, - const std::optional& format, - int buffer_size, - int (*read_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence) = nullptr, - const std::optional& option = std::nullopt); -}; - -// For BC -using StreamReader = StreamingMediaDecoder; -using StreamReaderCustomIO = StreamingMediaDecoderCustomIO; - -} // namespace io -} // namespace torio - -// For BC -namespace torchaudio::io { -using namespace torio::io; -} // namespace torchaudio::io diff --git a/src/libtorio/ffmpeg/stream_reader/typedefs.h b/src/libtorio/ffmpeg/stream_reader/typedefs.h deleted file mode 100644 index ee928be048..0000000000 --- a/src/libtorio/ffmpeg/stream_reader/typedefs.h +++ /dev/null @@ -1,165 +0,0 @@ -#pragma once - -#include -#include - -namespace torio { -namespace io { - -/// Information about source stream found in the input media. -struct SrcStreamInfo { - /// @name COMMON MEMBERS - ///@{ - - /// - /// The stream media type. - /// - /// Please see refer to - /// [the FFmpeg - /// documentation](https://ffmpeg.org/doxygen/4.1/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48) - /// for the available values - /// - /// @todo Introduce own enum and get rid of FFmpeg dependency - /// - AVMediaType media_type; - /// The name of codec. - const char* codec_name = "N/A"; - /// The name of codec in long, human friendly form. - const char* codec_long_name = "N/A"; - /// For audio, it is sample format. - /// - /// Commonly found values are; - /// - ``"u8"``, ``"u8p"``: 8-bit unsigned integer. 
- /// - ``"s16"``, ``"s16p"``: 16-bit signed integer. - /// - ``"s32"``, ``"s32p"``: 32-bit signed integer. - /// - ``"s64"``, ``"s64p"``: 64-bit signed integer. - /// - ``"flt"``, ``"fltp"``: 32-bit floating point. - /// - ``"dbl"``, ``"dblp"``: 64-bit floating point. - /// - /// For video, it is color channel format. - /// - /// Commonly found values include; - /// - ``"gray8"``: grayscale - /// - ``"rgb24"``: RGB - /// - ``"bgr24"``: BGR - /// - ``"yuv420p"``: YUV420p - const char* fmt_name = "N/A"; - - /// Bit rate - int64_t bit_rate = 0; - - /// Number of frames. - /// @note In some formats, the value is not reliable or unavailable. - int64_t num_frames = 0; - - /// Bits per sample - int bits_per_sample = 0; - - /// Metadata - /// - /// This method can fetch ID3 tag from MP3. - /// - /// Example: - /// - /// ``` - /// { - /// "title": "foo", - /// "artist": "bar", - /// "date": "2017" - /// } - /// ``` - OptionDict metadata{}; - - ///@} - - /// @name AUDIO-SPECIFIC MEMBERS - ///@{ - - /// Sample rate - double sample_rate = 0; - - /// The number of channels - int num_channels = 0; - - ///@} - - /// @name VIDEO-SPECIFIC MEMBERS - ///@{ - - /// Width - int width = 0; - - /// Height - int height = 0; - - /// Frame rate - double frame_rate = 0; - ///@} -}; - -/// Information about output stream configured by user code -struct OutputStreamInfo { - /// The index of the input source stream - int source_index; - - /// - /// The stream media type. - /// - /// Please see refer to - /// [the FFmpeg - /// documentation](https://ffmpeg.org/doxygen/4.1/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48) - /// for the available values - /// - /// @todo Introduce own enum and get rid of FFmpeg dependency - /// - AVMediaType media_type = AVMEDIA_TYPE_UNKNOWN; - /// Media format. AVSampleFormat for audio or AVPixelFormat for video. - int format = -1; - - /// Filter graph definition, such as - /// ``"aresample=16000,aformat=sample_fmts=fltp"``. 
- std::string filter_description{}; - - /// @name AUDIO-SPECIFIC MEMBERS - ///@{ - - /// Sample rate - double sample_rate = -1; - - /// The number of channels - int num_channels = -1; - - ///@} - - /// @name VIDEO-SPECIFIC MEMBERS - ///@{ - - /// Width - int width = -1; - - /// Height - int height = -1; - - /// Frame rate - AVRational frame_rate{0, 1}; - - ///@} -}; - -/// Stores decoded frames and metadata -struct Chunk { - /// Audio/video frames. - /// - /// For audio, the shape is ``[time, num_channels]``, and the ``dtype`` - /// depends on output stream configurations. - /// - /// For video, the shape is ``[time, channel, height, width]``, and - /// the ``dtype`` is ``torch.uint8``. - torch::Tensor frames; - /// - /// Presentation time stamp of the first frame, in second. - double pts; -}; - -} // namespace io -} // namespace torio diff --git a/src/libtorio/ffmpeg/stream_writer/encode_process.cpp b/src/libtorio/ffmpeg/stream_writer/encode_process.cpp deleted file mode 100644 index 9fce0ac909..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/encode_process.cpp +++ /dev/null @@ -1,976 +0,0 @@ -#include -#include -#include - -namespace torio::io { - -//////////////////////////////////////////////////////////////////////////////// -// EncodeProcess Logic Implementation -//////////////////////////////////////////////////////////////////////////////// - -EncodeProcess::EncodeProcess( - TensorConverter&& converter, - AVFramePtr&& frame, - FilterGraph&& filter_graph, - Encoder&& encoder, - AVCodecContextPtr&& codec_ctx) noexcept - : converter(std::move(converter)), - src_frame(std::move(frame)), - filter(std::move(filter_graph)), - encoder(std::move(encoder)), - codec_ctx(std::move(codec_ctx)) {} - -void EncodeProcess::process( - const torch::Tensor& tensor, - const std::optional& pts) { - if (pts) { - const double& pts_val = pts.value(); - TORCH_CHECK( - std::isfinite(pts_val) && pts_val >= 0.0, - "The value of PTS must be positive and finite. 
Found: ", - pts_val) - AVRational tb = codec_ctx->time_base; - auto val = static_cast(std::round(pts_val * tb.den / tb.num)); - if (src_frame->pts > val) { - TORCH_WARN_ONCE( - "The provided PTS value is smaller than the next expected value."); - } - src_frame->pts = val; - } - for (const auto& frame : converter.convert(tensor)) { - process_frame(frame); - frame->pts += frame->nb_samples; - } -} - -void EncodeProcess::process_frame(AVFrame* src) { - int ret = filter.add_frame(src); - while (ret >= 0) { - ret = filter.get_frame(dst_frame); - if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { - if (ret == AVERROR_EOF) { - encoder.encode(nullptr); - } - break; - } - if (ret >= 0) { - encoder.encode(dst_frame); - } - av_frame_unref(dst_frame); - } -} - -void EncodeProcess::flush() { - process_frame(nullptr); -} - -//////////////////////////////////////////////////////////////////////////////// -// EncodeProcess Initialization helper functions -//////////////////////////////////////////////////////////////////////////////// - -namespace { - -enum AVSampleFormat get_src_sample_fmt(const std::string& src) { - auto fmt = av_get_sample_fmt(src.c_str()); - if (fmt != AV_SAMPLE_FMT_NONE && !av_sample_fmt_is_planar(fmt)) { - return fmt; - } - TORCH_CHECK( - false, - "Unsupported sample fotmat (", - src, - ") was provided. 
Valid values are ", - []() -> std::string { - std::vector ret; - for (const auto& fmt : - {AV_SAMPLE_FMT_U8, - AV_SAMPLE_FMT_S16, - AV_SAMPLE_FMT_S32, - AV_SAMPLE_FMT_S64, - AV_SAMPLE_FMT_FLT, - AV_SAMPLE_FMT_DBL}) { - ret.emplace_back(av_get_sample_fmt_name(fmt)); - } - return c10::Join(", ", ret); - }(), - "."); -} - -const std::set SUPPORTED_PIX_FMTS{ - AV_PIX_FMT_GRAY8, - AV_PIX_FMT_RGB0, - AV_PIX_FMT_BGR0, - AV_PIX_FMT_RGB24, - AV_PIX_FMT_BGR24, - AV_PIX_FMT_YUV444P}; - -enum AVPixelFormat get_src_pix_fmt(const std::string& src) { - AVPixelFormat fmt = av_get_pix_fmt(src.c_str()); - TORCH_CHECK( - SUPPORTED_PIX_FMTS.count(fmt), - "Unsupported pixel format (", - src, - ") was provided. Valid values are ", - []() -> std::string { - std::vector ret; - for (const auto& fmt : SUPPORTED_PIX_FMTS) { - ret.emplace_back(av_get_pix_fmt_name(fmt)); - } - return c10::Join(", ", ret); - }(), - "."); - return fmt; -} - -//////////////////////////////////////////////////////////////////////////////// -// Codec & Codec context -//////////////////////////////////////////////////////////////////////////////// -const AVCodec* get_codec( - AVCodecID default_codec, - const std::optional& encoder) { - if (encoder) { - const AVCodec* c = avcodec_find_encoder_by_name(encoder.value().c_str()); - TORCH_CHECK(c, "Unexpected codec: ", encoder.value()); - return c; - } - const AVCodec* c = avcodec_find_encoder(default_codec); - TORCH_CHECK( - c, "Encoder not found for codec: ", avcodec_get_name(default_codec)); - return c; -} - -AVCodecContextPtr get_codec_ctx(const AVCodec* codec, int flags) { - AVCodecContext* ctx = avcodec_alloc_context3(codec); - TORCH_CHECK(ctx, "Failed to allocate CodecContext."); - - if (flags & AVFMT_GLOBALHEADER) { - ctx->flags |= AV_CODEC_FLAG_GLOBAL_HEADER; - } - return AVCodecContextPtr(ctx); -} - -void open_codec( - AVCodecContext* codec_ctx, - const std::optional& option) { - AVDictionary* opt = get_option_dict(option); - - // Enable experimental feature if 
required - // Note: - // "vorbis" refers to FFmpeg's native encoder, - // https://ffmpeg.org/doxygen/4.1/vorbisenc_8c.html#a8c2e524b0f125f045fef39c747561450 - // while "libvorbis" refers to the one depends on libvorbis, - // which is not experimental - // https://ffmpeg.org/doxygen/4.1/libvorbisenc_8c.html#a5dd5fc671e2df9c5b1f97b2ee53d4025 - // similarly, "opus" refers to FFmpeg's native encoder - // https://ffmpeg.org/doxygen/4.1/opusenc_8c.html#a05b203d4a9a231cc1fd5a7ddeb68cebc - // while "libopus" refers to the one depends on libopusenc - // https://ffmpeg.org/doxygen/4.1/libopusenc_8c.html#aa1d649e48cd2ec00cfe181cf9d0f3251 - if (std::strcmp(codec_ctx->codec->name, "vorbis") == 0) { - if (!av_dict_get(opt, "strict", nullptr, 0)) { - TORCH_WARN_ONCE( - "\"vorbis\" encoder is selected. Enabling '-strict experimental'. ", - "If this is not desired, please provide \"strict\" encoder option ", - "with desired value."); - av_dict_set(&opt, "strict", "experimental", 0); - } - } - if (std::strcmp(codec_ctx->codec->name, "opus") == 0) { - if (!av_dict_get(opt, "strict", nullptr, 0)) { - TORCH_WARN_ONCE( - "\"opus\" encoder is selected. Enabling '-strict experimental'. ", - "If this is not desired, please provide \"strict\" encoder option ", - "with desired value."); - av_dict_set(&opt, "strict", "experimental", 0); - } - } - - // Default to single thread execution. 
- if (!av_dict_get(opt, "threads", nullptr, 0)) { - av_dict_set(&opt, "threads", "1", 0); - } - - int ret = avcodec_open2(codec_ctx, codec_ctx->codec, &opt); - clean_up_dict(opt); - TORCH_CHECK(ret >= 0, "Failed to open codec: (", av_err2string(ret), ")"); -} - -//////////////////////////////////////////////////////////////////////////////// -// Audio codec -//////////////////////////////////////////////////////////////////////////////// - -bool supported_sample_fmt( - const AVSampleFormat fmt, - const AVSampleFormat* sample_fmts) { - if (!sample_fmts) { - return true; - } - while (*sample_fmts != AV_SAMPLE_FMT_NONE) { - if (fmt == *sample_fmts) { - return true; - } - ++sample_fmts; - } - return false; -} - -std::string get_supported_formats(const AVSampleFormat* sample_fmts) { - std::vector ret; - while (*sample_fmts != AV_SAMPLE_FMT_NONE) { - ret.emplace_back(av_get_sample_fmt_name(*sample_fmts)); - ++sample_fmts; - } - return c10::Join(", ", ret); -} - -AVSampleFormat get_enc_fmt( - AVSampleFormat src_fmt, - const std::optional& encoder_format, - const AVCodec* codec) { - if (encoder_format) { - auto& enc_fmt_val = encoder_format.value(); - auto fmt = av_get_sample_fmt(enc_fmt_val.c_str()); - TORCH_CHECK( - fmt != AV_SAMPLE_FMT_NONE, "Unknown sample format: ", enc_fmt_val); - TORCH_CHECK( - supported_sample_fmt(fmt, codec->sample_fmts), - codec->name, - " does not support ", - encoder_format.value(), - " format. 
Supported values are; ", - get_supported_formats(codec->sample_fmts)); - return fmt; - } - if (codec->sample_fmts) { - return codec->sample_fmts[0]; - } - return src_fmt; -}; - -bool supported_sample_rate(const int sample_rate, const AVCodec* codec) { - if (!codec->supported_samplerates) { - return true; - } - const int* it = codec->supported_samplerates; - while (*it) { - if (sample_rate == *it) { - return true; - } - ++it; - } - return false; -} - -std::string get_supported_samplerates(const int* supported_samplerates) { - std::vector ret; - if (supported_samplerates) { - while (*supported_samplerates) { - ret.push_back(*supported_samplerates); - ++supported_samplerates; - } - } - return c10::Join(", ", ret); -} - -int get_enc_sr( - int src_sample_rate, - const std::optional& encoder_sample_rate, - const AVCodec* codec) { - // G.722 only supports 16000 Hz, but it does not list the sample rate in - // supported_samplerates so we hard code it here. - if (codec->id == AV_CODEC_ID_ADPCM_G722) { - if (encoder_sample_rate) { - auto val = encoder_sample_rate.value(); - TORCH_CHECK( - val == 16'000, - codec->name, - " does not support sample rate ", - val, - ". Supported values are; 16000."); - } - return 16'000; - } - if (encoder_sample_rate) { - const int& encoder_sr = encoder_sample_rate.value(); - TORCH_CHECK( - encoder_sr > 0, - "Encoder sample rate must be positive. Found: ", - encoder_sr); - TORCH_CHECK( - supported_sample_rate(encoder_sr, codec), - codec->name, - " does not support sample rate ", - encoder_sr, - ". 
Supported values are; ", - get_supported_samplerates(codec->supported_samplerates)); - return encoder_sr; - } - if (codec->supported_samplerates && - !supported_sample_rate(src_sample_rate, codec)) { - return codec->supported_samplerates[0]; - } - return src_sample_rate; -} - -std::string get_supported_channels(const uint64_t* channel_layouts) { - std::vector names; - while (*channel_layouts) { - std::stringstream ss; - ss << av_get_channel_layout_nb_channels(*channel_layouts); - ss << " (" << av_get_channel_name(*channel_layouts) << ")"; - names.emplace_back(ss.str()); - ++channel_layouts; - } - return c10::Join(", ", names); -} - -uint64_t get_channel_layout( - const uint64_t src_ch_layout, - const std::optional enc_num_channels, - const AVCodec* codec) { - // If the override is presented, and if it is supported by codec, we use it. - if (enc_num_channels) { - const int& val = enc_num_channels.value(); - TORCH_CHECK( - val > 0, "The number of channels must be greater than 0. Found: ", val); - if (!codec->channel_layouts) { - return static_cast(av_get_default_channel_layout(val)); - } - for (const uint64_t* it = codec->channel_layouts; *it; ++it) { - if (av_get_channel_layout_nb_channels(*it) == val) { - return *it; - } - } - TORCH_CHECK( - false, - "Codec ", - codec->name, - " does not support a channel layout consists of ", - val, - " channels. Supported values are: ", - get_supported_channels(codec->channel_layouts)); - } - // If the codec does not have restriction on channel layout, we reuse the - // source channel layout - if (!codec->channel_layouts) { - return src_ch_layout; - } - // If the codec has restriction, and source layout is supported, we reuse the - // source channel layout - for (const uint64_t* it = codec->channel_layouts; *it; ++it) { - if (*it == src_ch_layout) { - return src_ch_layout; - } - } - // Use the default layout of the codec. 
- return codec->channel_layouts[0]; -} - -void configure_audio_codec_ctx( - AVCodecContext* codec_ctx, - AVSampleFormat format, - int sample_rate, - uint64_t channel_layout, - const std::optional& codec_config) { - codec_ctx->sample_fmt = format; - codec_ctx->sample_rate = sample_rate; - codec_ctx->time_base = av_inv_q(av_d2q(sample_rate, 1 << 24)); - codec_ctx->channels = av_get_channel_layout_nb_channels(channel_layout); - codec_ctx->channel_layout = channel_layout; - - // Set optional stuff - if (codec_config) { - auto& cfg = codec_config.value(); - if (cfg.bit_rate > 0) { - codec_ctx->bit_rate = cfg.bit_rate; - } - if (cfg.compression_level != -1) { - codec_ctx->compression_level = cfg.compression_level; - } - if (cfg.qscale) { - codec_ctx->flags |= AV_CODEC_FLAG_QSCALE; - codec_ctx->global_quality = FF_QP2LAMBDA * cfg.qscale.value(); - } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Video codec -//////////////////////////////////////////////////////////////////////////////// - -bool supported_pix_fmt(const AVPixelFormat fmt, const AVPixelFormat* pix_fmts) { - if (!pix_fmts) { - return true; - } - while (*pix_fmts != AV_PIX_FMT_NONE) { - if (fmt == *pix_fmts) { - return true; - } - ++pix_fmts; - } - return false; -} - -std::string get_supported_formats(const AVPixelFormat* pix_fmts) { - std::vector ret; - while (*pix_fmts != AV_PIX_FMT_NONE) { - ret.emplace_back(av_get_pix_fmt_name(*pix_fmts)); - ++pix_fmts; - } - return c10::Join(", ", ret); -} - -AVPixelFormat get_enc_fmt( - AVPixelFormat src_fmt, - const std::optional& encoder_format, - const AVCodec* codec) { - if (encoder_format) { - const auto& val = encoder_format.value(); - auto fmt = av_get_pix_fmt(val.c_str()); - TORCH_CHECK( - supported_pix_fmt(fmt, codec->pix_fmts), - codec->name, - " does not support ", - val, - " format. 
Supported values are; ", - get_supported_formats(codec->pix_fmts)); - return fmt; - } - if (codec->pix_fmts) { - return codec->pix_fmts[0]; - } - return src_fmt; -} - -bool supported_frame_rate(AVRational rate, const AVRational* rates) { - if (!rates) { - return true; - } - for (; !(rates->num == 0 && rates->den == 0); ++rates) { - if (av_cmp_q(rate, *rates) == 0) { - return true; - } - } - return false; -} - -AVRational get_enc_rate( - AVRational src_rate, - const std::optional& encoder_sample_rate, - const AVCodec* codec) { - if (encoder_sample_rate) { - const double& enc_rate = encoder_sample_rate.value(); - TORCH_CHECK( - std::isfinite(enc_rate) && enc_rate > 0, - "Encoder sample rate must be positive and fininte. Found: ", - enc_rate); - AVRational rate = av_d2q(enc_rate, 1 << 24); - TORCH_CHECK( - supported_frame_rate(rate, codec->supported_framerates), - codec->name, - " does not support frame rate: ", - enc_rate, - ". Supported values are; ", - [&]() { - std::vector ret; - for (auto r = codec->supported_framerates; - !(r->num == 0 && r->den == 0); - ++r) { - ret.push_back(c10::Join("/", std::array{r->num, r->den})); - } - return c10::Join(", ", ret); - }()); - return rate; - } - if (codec->supported_framerates && - !supported_frame_rate(src_rate, codec->supported_framerates)) { - return codec->supported_framerates[0]; - } - return src_rate; -} - -void configure_video_codec_ctx( - AVCodecContextPtr& ctx, - AVPixelFormat format, - AVRational frame_rate, - int width, - int height, - const std::optional& codec_config) { - // TODO: Review other options and make them configurable? 
- // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00147 - // - bit_rate_tolerance - // - mb_decisions - - ctx->pix_fmt = format; - ctx->width = width; - ctx->height = height; - ctx->time_base = av_inv_q(frame_rate); - - // Set optional stuff - if (codec_config) { - auto& cfg = codec_config.value(); - if (cfg.bit_rate > 0) { - ctx->bit_rate = cfg.bit_rate; - } - if (cfg.compression_level != -1) { - ctx->compression_level = cfg.compression_level; - } - if (cfg.gop_size != -1) { - ctx->gop_size = cfg.gop_size; - } - if (cfg.max_b_frames != -1) { - ctx->max_b_frames = cfg.max_b_frames; - } - if (cfg.qscale) { - ctx->flags |= AV_CODEC_FLAG_QSCALE; - ctx->global_quality = FF_QP2LAMBDA * cfg.qscale.value(); - } - } -} - -#ifdef USE_CUDA -void configure_hw_accel(AVCodecContext* ctx, const std::string& hw_accel) { - torch::Device device{hw_accel}; - TORCH_CHECK( - device.is_cuda(), - "Only CUDA is supported for hardware acceleration. Found: ", - device); - - // NOTES: - // 1. Examples like - // https://ffmpeg.org/doxygen/4.1/hw_decode_8c-example.html#a9 wraps the HW - // device context and the HW frames context with av_buffer_ref. This - // increments the reference counting and the resource won't be automatically - // dallocated at the time AVCodecContex is destructed. (We will need to - // decrement once ourselves), so we do not do it. When adding support to share - // context objects, this needs to be reviewed. - // - // 2. When encoding, it is technically not necessary to attach HW device - // context to AVCodecContext. But this way, it will be deallocated - // automatically at the time AVCodecContext is freed, so we do that. 
- - ctx->hw_device_ctx = av_buffer_ref(get_cuda_context(device.index())); - TORCH_INTERNAL_ASSERT( - ctx->hw_device_ctx, "Failed to reference HW device context."); - - ctx->sw_pix_fmt = ctx->pix_fmt; - ctx->pix_fmt = AV_PIX_FMT_CUDA; - - ctx->hw_frames_ctx = av_hwframe_ctx_alloc(ctx->hw_device_ctx); - TORCH_CHECK(ctx->hw_frames_ctx, "Failed to create CUDA frame context."); - - auto frames_ctx = (AVHWFramesContext*)(ctx->hw_frames_ctx->data); - frames_ctx->format = ctx->pix_fmt; - frames_ctx->sw_format = ctx->sw_pix_fmt; - frames_ctx->width = ctx->width; - frames_ctx->height = ctx->height; - frames_ctx->initial_pool_size = 5; - - int ret = av_hwframe_ctx_init(ctx->hw_frames_ctx); - TORCH_CHECK( - ret >= 0, - "Failed to initialize CUDA frame context: ", - av_err2string(ret)); -} -#endif // USE_CUDA - -//////////////////////////////////////////////////////////////////////////////// -// AVStream -//////////////////////////////////////////////////////////////////////////////// - -AVStream* get_stream(AVFormatContext* format_ctx, AVCodecContext* codec_ctx) { - AVStream* stream = avformat_new_stream(format_ctx, nullptr); - TORCH_CHECK(stream, "Failed to allocate stream."); - - stream->time_base = codec_ctx->time_base; - int ret = avcodec_parameters_from_context(stream->codecpar, codec_ctx); - TORCH_CHECK( - ret >= 0, "Failed to copy the stream parameter: ", av_err2string(ret)); - return stream; -} - -//////////////////////////////////////////////////////////////////////////////// -// FilterGraph -//////////////////////////////////////////////////////////////////////////////// - -FilterGraph get_audio_filter_graph( - AVSampleFormat src_fmt, - int src_sample_rate, - uint64_t src_ch_layout, - const std::optional& filter_desc, - AVSampleFormat enc_fmt, - int enc_sample_rate, - uint64_t enc_ch_layout, - int nb_samples) { - const auto desc = [&]() -> const std::string { - std::vector parts; - if (filter_desc) { - parts.push_back(filter_desc.value()); - } - if (filter_desc || 
src_fmt != enc_fmt || - src_sample_rate != enc_sample_rate || src_ch_layout != enc_ch_layout) { - std::stringstream ss; - ss << "aformat=sample_fmts=" << av_get_sample_fmt_name(enc_fmt) - << ":sample_rates=" << enc_sample_rate << ":channel_layouts=0x" - << std::hex << enc_ch_layout; - parts.push_back(ss.str()); - } - if (nb_samples > 0) { - std::stringstream ss; - ss << "asetnsamples=n=" << nb_samples << ":p=0"; - parts.push_back(ss.str()); - } - if (parts.size()) { - return c10::Join(",", parts); - } - return "anull"; - }(); - - FilterGraph f; - f.add_audio_src( - src_fmt, {1, src_sample_rate}, src_sample_rate, src_ch_layout); - f.add_audio_sink(); - f.add_process(desc); - f.create_filter(); - return f; -} - -FilterGraph get_video_filter_graph( - AVPixelFormat src_fmt, - AVRational src_rate, - int src_width, - int src_height, - const std::optional& filter_desc, - AVPixelFormat enc_fmt, - AVRational enc_rate, - int enc_width, - int enc_height, - bool is_cuda) { - const auto desc = [&]() -> const std::string { - if (is_cuda) { - return filter_desc.value_or("null"); - } - std::vector parts; - if (filter_desc) { - parts.push_back(filter_desc.value()); - } - if (filter_desc || (src_width != enc_width || src_height != enc_height)) { - std::stringstream ss; - ss << "scale=" << enc_width << ":" << enc_height; - parts.emplace_back(ss.str()); - } - if (filter_desc || src_fmt != enc_fmt) { - std::stringstream ss; - ss << "format=" << av_get_pix_fmt_name(enc_fmt); - parts.emplace_back(ss.str()); - } - if (filter_desc || - (src_rate.num != enc_rate.num || src_rate.den != enc_rate.den)) { - std::stringstream ss; - ss << "fps=" << enc_rate.num << "/" << enc_rate.den; - parts.emplace_back(ss.str()); - } - if (parts.size()) { - return c10::Join(",", parts); - } - return "null"; - }(); - - FilterGraph f; - f.add_video_src( - is_cuda ? 
AV_PIX_FMT_CUDA : src_fmt, - av_inv_q(src_rate), - src_rate, - src_width, - src_height, - {1, 1}); - f.add_video_sink(); - f.add_process(desc); - f.create_filter(); - return f; -} - -//////////////////////////////////////////////////////////////////////////////// -// Source frame -//////////////////////////////////////////////////////////////////////////////// - -AVFramePtr get_audio_frame( - AVSampleFormat format, - int sample_rate, - int num_channels, - uint64_t channel_layout, - int nb_samples) { - AVFramePtr frame{alloc_avframe()}; - frame->format = format; - frame->channel_layout = channel_layout; - frame->sample_rate = sample_rate; - frame->nb_samples = nb_samples; - int ret = av_frame_get_buffer(frame, 0); - TORCH_CHECK( - ret >= 0, "Error allocating the source audio frame:", av_err2string(ret)); - - // Note: `channels` attribute is not required for encoding, but - // TensorConverter refers to it - frame->channels = num_channels; - frame->pts = 0; - return frame; -} - -AVFramePtr get_video_frame(AVPixelFormat src_fmt, int width, int height) { - AVFramePtr frame{alloc_avframe()}; - frame->format = src_fmt; - frame->width = width; - frame->height = height; - int ret = av_frame_get_buffer(frame, 0); - TORCH_CHECK( - ret >= 0, "Error allocating a video buffer :", av_err2string(ret)); - - // Note: `nb_samples` attribute is not used for video, but we set it - // anyways so that we can make the logic of PTS increment agnostic to - // audio and video. 
- frame->nb_samples = 1; - frame->pts = 0; - return frame; -} - -} // namespace - -//////////////////////////////////////////////////////////////////////////////// -// Finally, the extern-facing API -//////////////////////////////////////////////////////////////////////////////// - -EncodeProcess get_audio_encode_process( - AVFormatContext* format_ctx, - int src_sample_rate, - int src_num_channels, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_sample_rate, - const std::optional& encoder_num_channels, - const std::optional& codec_config, - const std::optional& filter_desc, - bool disable_converter) { - // 1. Check the source format, rate and channels - TORCH_CHECK( - src_sample_rate > 0, - "Sample rate must be positive. Found: ", - src_sample_rate); - TORCH_CHECK( - src_num_channels > 0, - "The number of channels must be positive. Found: ", - src_num_channels); - // Note that disable_converter = true indicates that the caller is looking to - // directly supply frames and bypass tensor conversion. Therefore, in this - // case, restrictions on the format to support tensor inputs do not apply, and - // so we directly get the format via FFmpeg. - const AVSampleFormat src_fmt = (disable_converter) - ? av_get_sample_fmt(format.c_str()) - : get_src_sample_fmt(format); - const auto src_ch_layout = - static_cast(av_get_default_channel_layout(src_num_channels)); - - // 2. Fetch codec from default or override - TORCH_CHECK( - format_ctx->oformat->audio_codec != AV_CODEC_ID_NONE, - format_ctx->oformat->name, - " does not support audio."); - const AVCodec* codec = get_codec(format_ctx->oformat->audio_codec, encoder); - - // 3. 
Check that encoding sample format, sample rate and channels - const AVSampleFormat enc_fmt = get_enc_fmt(src_fmt, encoder_format, codec); - const int enc_sr = get_enc_sr(src_sample_rate, encoder_sample_rate, codec); - const uint64_t enc_ch_layout = [&]() -> uint64_t { - if (std::strcmp(codec->name, "vorbis") == 0) { - // Special case for vorbis. - // It only supports 2 channels, but it is not listed in channel_layouts - // attributes. - // https://github.com/FFmpeg/FFmpeg/blob/0684e58886881a998f1a7b510d73600ff1df2b90/libavcodec/vorbisenc.c#L1277 - // This is the case for at least until FFmpeg 6.0, so it will be - // like this for a while. - return static_cast(av_get_default_channel_layout(2)); - } - return get_channel_layout(src_ch_layout, encoder_num_channels, codec); - }(); - - // 4. Initialize codec context - AVCodecContextPtr codec_ctx = - get_codec_ctx(codec, format_ctx->oformat->flags); - configure_audio_codec_ctx( - codec_ctx, enc_fmt, enc_sr, enc_ch_layout, codec_config); - open_codec(codec_ctx, encoder_option); - - // 5. Build filter graph - FilterGraph filter_graph = get_audio_filter_graph( - src_fmt, - src_sample_rate, - src_ch_layout, - filter_desc, - enc_fmt, - enc_sr, - enc_ch_layout, - codec_ctx->frame_size); - - // 6. Instantiate source frame - AVFramePtr src_frame = get_audio_frame( - src_fmt, - src_sample_rate, - src_num_channels, - src_ch_layout, - codec_ctx->frame_size > 0 ? codec_ctx->frame_size : 256); - - // 7. Instantiate Converter - TensorConverter converter{ - (disable_converter) ? AVMEDIA_TYPE_UNKNOWN : AVMEDIA_TYPE_AUDIO, - src_frame, - src_frame->nb_samples}; - - // 8. encoder - // Note: get_stream modifies AVFormatContext and adds new stream. - // If anything after this throws, it will leave the StreamingMediaEncoder in - // an invalid state. 
- Encoder enc{format_ctx, codec_ctx, get_stream(format_ctx, codec_ctx)}; - - return EncodeProcess{ - std::move(converter), - std::move(src_frame), - std::move(filter_graph), - std::move(enc), - std::move(codec_ctx)}; -} - -namespace { - -bool ends_with(std::string_view str, std::string_view suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - -} // namespace - -EncodeProcess get_video_encode_process( - AVFormatContext* format_ctx, - double frame_rate, - int src_width, - int src_height, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_frame_rate, - const std::optional& encoder_width, - const std::optional& encoder_height, - const std::optional& hw_accel, - const std::optional& codec_config, - const std::optional& filter_desc, - bool disable_converter) { - // 1. Checkc the source format, rate and resolution - TORCH_CHECK( - std::isfinite(frame_rate) && frame_rate > 0, - "Frame rate must be positive and finite. Found: ", - frame_rate); - TORCH_CHECK(src_width > 0, "width must be positive. Found: ", src_width); - TORCH_CHECK(src_height > 0, "height must be positive. Found: ", src_height); - // Note that disable_converter = true indicates that the caller is looking to - // directly supply frames and bypass tensor conversion. Therefore, in this - // case, restrictions on the format to support tensor inputs do not apply, and - // so we directly get the format via FFmpeg. - const AVPixelFormat src_fmt = (disable_converter) - ? av_get_pix_fmt(format.c_str()) - : get_src_pix_fmt(format); - const AVRational src_rate = av_d2q(frame_rate, 1 << 24); - - // 2. 
Fetch codec from default or override - TORCH_CHECK( - format_ctx->oformat->video_codec != AV_CODEC_ID_NONE, - format_ctx->oformat->name, - " does not support video."); - const AVCodec* codec = get_codec(format_ctx->oformat->video_codec, encoder); - - // 3. Check that encoding format, rate - const AVPixelFormat enc_fmt = get_enc_fmt(src_fmt, encoder_format, codec); - const AVRational enc_rate = get_enc_rate(src_rate, encoder_frame_rate, codec); - const int enc_width = [&]() -> int { - if (!encoder_width) { - return src_width; - } - const int& val = encoder_width.value(); - TORCH_CHECK(val > 0, "Encoder width must be positive. Found: ", val); - return val; - }(); - const int enc_height = [&]() -> int { - if (!encoder_height) { - return src_height; - } - const int& val = encoder_height.value(); - TORCH_CHECK(val > 0, "Encoder height must be positive. Found: ", val); - return val; - }(); - - // 4. Initialize codec context - AVCodecContextPtr codec_ctx = - get_codec_ctx(codec, format_ctx->oformat->flags); - configure_video_codec_ctx( - codec_ctx, enc_fmt, enc_rate, enc_width, enc_height, codec_config); - if (hw_accel) { -#ifdef USE_CUDA - configure_hw_accel(codec_ctx, hw_accel.value()); -#else - TORCH_CHECK( - false, - "torchaudio is not compiled with CUDA support. ", - "Hardware acceleration is not available."); -#endif - } - open_codec(codec_ctx, encoder_option); - - if (ends_with(codec_ctx->codec->name, "_nvenc")) { - C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamingMediaDecoderCUDA"); - } - - // 5. Build filter graph - FilterGraph filter_graph = get_video_filter_graph( - src_fmt, - src_rate, - src_width, - src_height, - filter_desc, - enc_fmt, - enc_rate, - enc_width, - enc_height, - hw_accel.has_value()); - - // 6. 
Instantiate source frame - AVFramePtr src_frame = [&]() { - if (codec_ctx->hw_frames_ctx) { - AVFramePtr frame{alloc_avframe()}; - int ret = av_hwframe_get_buffer(codec_ctx->hw_frames_ctx, frame, 0); - TORCH_CHECK(ret >= 0, "Failed to fetch CUDA frame: ", av_err2string(ret)); - frame->nb_samples = 1; - frame->pts = 0; - return frame; - } - return get_video_frame(src_fmt, src_width, src_height); - }(); - - // 7. Converter - TensorConverter converter{ - (disable_converter) ? AVMEDIA_TYPE_UNKNOWN : AVMEDIA_TYPE_VIDEO, - src_frame}; - - // 8. encoder - // Note: get_stream modifies AVFormatContext and adds new stream. - // If anything after this throws, it will leave the StreamingMediaEncoder in - // an invalid state. - Encoder enc{format_ctx, codec_ctx, get_stream(format_ctx, codec_ctx)}; - - return EncodeProcess{ - std::move(converter), - std::move(src_frame), - std::move(filter_graph), - std::move(enc), - std::move(codec_ctx)}; -} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/encode_process.h b/src/libtorio/ffmpeg/stream_writer/encode_process.h deleted file mode 100644 index 4c8cc9ee9e..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/encode_process.h +++ /dev/null @@ -1,67 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include - -namespace torio::io { - -class EncodeProcess { - TensorConverter converter; - AVFramePtr src_frame; - FilterGraph filter; - AVFramePtr dst_frame{alloc_avframe()}; - Encoder encoder; - AVCodecContextPtr codec_ctx; - - public: - EncodeProcess( - TensorConverter&& converter, - AVFramePtr&& frame, - FilterGraph&& filter_graph, - Encoder&& encoder, - AVCodecContextPtr&& codec_ctx) noexcept; - - EncodeProcess(EncodeProcess&&) noexcept = default; - - void process(const torch::Tensor& tensor, const std::optional& pts); - - void process_frame(AVFrame* src); - - void flush(); -}; - -EncodeProcess get_audio_encode_process( - AVFormatContext* format_ctx, - int sample_rate, - int num_channels, - 
const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_sample_rate, - const std::optional& encoder_num_channels, - const std::optional& codec_config, - const std::optional& filter_desc, - bool disable_converter = false); - -EncodeProcess get_video_encode_process( - AVFormatContext* format_ctx, - double frame_rate, - int width, - int height, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_frame_rate, - const std::optional& encoder_width, - const std::optional& encoder_height, - const std::optional& hw_accel, - const std::optional& codec_config, - const std::optional& filter_desc, - bool disable_converter = false); - -}; // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/encoder.cpp b/src/libtorio/ffmpeg/stream_writer/encoder.cpp deleted file mode 100644 index b1cdfa91c3..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/encoder.cpp +++ /dev/null @@ -1,62 +0,0 @@ -#include - -namespace torio::io { - -Encoder::Encoder( - AVFormatContext* format_ctx, - AVCodecContext* codec_ctx, - AVStream* stream) noexcept - : format_ctx(format_ctx), codec_ctx(codec_ctx), stream(stream) {} - -/// -/// Encode the given AVFrame data -/// -/// @param frame Frame data to encode -void Encoder::encode(AVFrame* frame) { - int ret = avcodec_send_frame(codec_ctx, frame); - TORCH_CHECK(ret >= 0, "Failed to encode frame (", av_err2string(ret), ")."); - while (ret >= 0) { - ret = avcodec_receive_packet(codec_ctx, packet); - if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) { - if (ret == AVERROR_EOF) { - // Note: - // av_interleaved_write_frame buffers the packets internally as needed - // to make sure the packets in the output file are properly interleaved - // in the order of increasing dts. 
- // https://ffmpeg.org/doxygen/3.4/group__lavf__encoding.html#ga37352ed2c63493c38219d935e71db6c1 - // Passing nullptr will (forcefully) flush the queue, and this is - // necessary if users mal-configure the streams. - - // Possible follow up: Add flush_buffer method? - // An alternative is to use `av_write_frame` functoin, but in that case - // client code is responsible for ordering packets, which makes it - // complicated to use StreamingMediaEncoder - ret = av_interleaved_write_frame(format_ctx, nullptr); - TORCH_CHECK( - ret >= 0, "Failed to flush packet (", av_err2string(ret), ")."); - } - break; - } else { - TORCH_CHECK( - ret >= 0, - "Failed to fetch encoded packet (", - av_err2string(ret), - ")."); - } - // https://github.com/pytorch/audio/issues/2790 - // If this is not set, the last frame is not properly saved, as - // the encoder cannot figure out when the packet should finish. - if (packet->duration == 0 && codec_ctx->codec_type == AVMEDIA_TYPE_VIDEO) { - // 1 means that 1 frame (in codec time base, which is the frame rate) - // This has to be set before av_packet_rescale_ts bellow. - packet->duration = 1; - } - av_packet_rescale_ts(packet, codec_ctx->time_base, stream->time_base); - packet->stream_index = stream->index; - - ret = av_interleaved_write_frame(format_ctx, packet); - TORCH_CHECK(ret >= 0, "Failed to write packet (", av_err2string(ret), ")."); - } -} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/encoder.h b/src/libtorio/ffmpeg/stream_writer/encoder.h deleted file mode 100644 index 3ced3c1644..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/encoder.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace torio::io { - -// Encoder + Muxer -class Encoder { - // Reference to the AVFormatContext (muxer) - AVFormatContext* format_ctx; - // Reference to codec context (encoder) - AVCodecContext* codec_ctx; - // Stream object as reference. Owned by AVFormatContext. 
- AVStream* stream; - // Temporary object used during the encoding - // Encoder owns it. - AVPacketPtr packet{alloc_avpacket()}; - - public: - Encoder( - AVFormatContext* format_ctx, - AVCodecContext* codec_ctx, - AVStream* stream) noexcept; - - void encode(AVFrame* frame); -}; - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/packet_writer.cpp b/src/libtorio/ffmpeg/stream_writer/packet_writer.cpp deleted file mode 100644 index 2b8091b0a2..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/packet_writer.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include - -namespace torio::io { -namespace { -AVStream* add_stream( - AVFormatContext* format_ctx, - const StreamParams& stream_params) { - AVStream* stream = avformat_new_stream(format_ctx, nullptr); - int ret = - avcodec_parameters_copy(stream->codecpar, stream_params.codec_params); - TORCH_CHECK( - ret >= 0, - "Failed to copy the stream's codec parameters. (", - av_err2string(ret), - ")"); - stream->time_base = stream_params.time_base; - return stream; -} -} // namespace -PacketWriter::PacketWriter( - AVFormatContext* format_ctx_, - const StreamParams& stream_params_) - : format_ctx(format_ctx_), - stream(add_stream(format_ctx_, stream_params_)), - original_time_base(stream_params_.time_base) {} - -void PacketWriter::write_packet(const AVPacketPtr& packet) { - AVPacket dst_packet; - int ret = av_packet_ref(&dst_packet, packet); - TORCH_CHECK(ret >= 0, "Failed to copy packet."); - av_packet_rescale_ts(&dst_packet, original_time_base, stream->time_base); - dst_packet.stream_index = stream->index; - ret = av_interleaved_write_frame(format_ctx, &dst_packet); - TORCH_CHECK(ret >= 0, "Failed to write packet to destination."); -} -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/packet_writer.h b/src/libtorio/ffmpeg/stream_writer/packet_writer.h deleted file mode 100644 index a8d65533c2..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/packet_writer.h +++ /dev/null @@ -1,16 +0,0 @@ 
-#pragma once -#include - -namespace torio::io { -class PacketWriter { - AVFormatContext* format_ctx; - AVStream* stream; - AVRational original_time_base; - - public: - PacketWriter( - AVFormatContext* format_ctx_, - const StreamParams& stream_params_); - void write_packet(const AVPacketPtr& packet); -}; -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/stream_writer.cpp b/src/libtorio/ffmpeg/stream_writer/stream_writer.cpp deleted file mode 100644 index 95eff14753..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/stream_writer.cpp +++ /dev/null @@ -1,390 +0,0 @@ -#include - -#ifdef USE_CUDA -#include -#endif - -namespace torio { -namespace io { -namespace { - -AVFormatContext* get_output_format_context( - const std::string& dst, - const std::optional& format, - AVIOContext* io_ctx) { - if (io_ctx) { - TORCH_CHECK( - format, - "`format` must be provided when the input is file-like object."); - } - - AVFormatContext* p = nullptr; - int ret = avformat_alloc_output_context2( - &p, nullptr, format ? 
format.value().c_str() : nullptr, dst.c_str()); - TORCH_CHECK( - ret >= 0, - "Failed to open output \"", - dst, - "\" (", - av_err2string(ret), - ")."); - - if (io_ctx) { - p->pb = io_ctx; - p->flags |= AVFMT_FLAG_CUSTOM_IO; - } - - return p; -} -} // namespace - -StreamingMediaEncoder::StreamingMediaEncoder(AVFormatContext* p) - : format_ctx(p) { - C10_LOG_API_USAGE_ONCE("torchaudio.io.StreamingMediaEncoder"); -} - -StreamingMediaEncoder::StreamingMediaEncoder( - AVIOContext* io_ctx, - const std::optional& format) - : StreamingMediaEncoder( - get_output_format_context("Custom Output Context", format, io_ctx)) {} - -StreamingMediaEncoder::StreamingMediaEncoder( - const std::string& dst, - const std::optional& format) - : StreamingMediaEncoder(get_output_format_context(dst, format, nullptr)) {} - -void StreamingMediaEncoder::add_audio_stream( - int sample_rate, - int num_channels, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_sample_rate, - const std::optional& encoder_num_channels, - const std::optional& codec_config, - const std::optional& filter_desc) { - TORCH_CHECK(!is_open, "Output is already opened. 
Cannot add a new stream."); - TORCH_INTERNAL_ASSERT( - format_ctx->nb_streams == num_output_streams(), - "The number of encode process and the number of output streams do not match."); - processes.emplace( - std::piecewise_construct, - std::forward_as_tuple(current_key), - std::forward_as_tuple(get_audio_encode_process( - format_ctx, - sample_rate, - num_channels, - format, - encoder, - encoder_option, - encoder_format, - encoder_sample_rate, - encoder_num_channels, - codec_config, - filter_desc))); - current_key++; -} - -void StreamingMediaEncoder::add_video_stream( - double frame_rate, - int width, - int height, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_frame_rate, - const std::optional& encoder_width, - const std::optional& encoder_height, - const std::optional& hw_accel, - const std::optional& codec_config, - const std::optional& filter_desc) { - TORCH_CHECK(!is_open, "Output is already opened. 
Cannot add a new stream."); - TORCH_INTERNAL_ASSERT( - format_ctx->nb_streams == num_output_streams(), - "The number of encode process and the number of output streams do not match."); - processes.emplace( - std::piecewise_construct, - std::forward_as_tuple(current_key), - std::forward_as_tuple(get_video_encode_process( - format_ctx, - frame_rate, - width, - height, - format, - encoder, - encoder_option, - encoder_format, - encoder_frame_rate, - encoder_width, - encoder_height, - hw_accel, - codec_config, - filter_desc))); - current_key++; -} - -void StreamingMediaEncoder::add_packet_stream( - const StreamParams& stream_params) { - packet_writers.emplace( - std::piecewise_construct, - std::forward_as_tuple(stream_params.stream_index), - std::forward_as_tuple(format_ctx, stream_params)); - current_key++; -} - -void StreamingMediaEncoder::add_audio_frame_stream( - int sample_rate, - int num_channels, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_sample_rate, - const std::optional& encoder_num_channels, - const std::optional& codec_config, - const std::optional& filter_desc) { - TORCH_CHECK(!is_open, "Output is already opened. 
Cannot add a new stream."); - TORCH_INTERNAL_ASSERT( - format_ctx->nb_streams == num_output_streams(), - "The number of encode process and the number of output streams do not match."); - processes.emplace( - std::piecewise_construct, - std::forward_as_tuple(current_key), - std::forward_as_tuple(get_audio_encode_process( - format_ctx, - sample_rate, - num_channels, - format, - encoder, - encoder_option, - encoder_format, - encoder_sample_rate, - encoder_num_channels, - codec_config, - filter_desc, - true))); - current_key++; -} - -void StreamingMediaEncoder::add_video_frame_stream( - double frame_rate, - int width, - int height, - const std::string& format, - const std::optional& encoder, - const std::optional& encoder_option, - const std::optional& encoder_format, - const std::optional& encoder_frame_rate, - const std::optional& encoder_width, - const std::optional& encoder_height, - const std::optional& hw_accel, - const std::optional& codec_config, - const std::optional& filter_desc) { - TORCH_CHECK(!is_open, "Output is already opened. 
Cannot add a new stream."); - TORCH_INTERNAL_ASSERT( - format_ctx->nb_streams == num_output_streams(), - "The number of encode process and the number of output streams do not match."); - processes.emplace( - std::piecewise_construct, - std::forward_as_tuple(current_key), - std::forward_as_tuple(get_video_encode_process( - format_ctx, - frame_rate, - width, - height, - format, - encoder, - encoder_option, - encoder_format, - encoder_frame_rate, - encoder_width, - encoder_height, - hw_accel, - codec_config, - filter_desc, - true))); - current_key++; -} - -void StreamingMediaEncoder::set_metadata(const OptionDict& metadata) { - av_dict_free(&format_ctx->metadata); - for (auto const& [key, value] : metadata) { - av_dict_set(&format_ctx->metadata, key.c_str(), value.c_str(), 0); - } -} - -void StreamingMediaEncoder::dump_format(int64_t i) { - av_dump_format(format_ctx, (int)i, format_ctx->url, 1); -} - -void StreamingMediaEncoder::open(const std::optional& option) { - TORCH_INTERNAL_ASSERT( - format_ctx->nb_streams == num_output_streams(), - "The number of encode process and the number of output streams do not match."); - - int ret = 0; - - // Open the file if it was not provided by client code (i.e. 
when not - // file-like object) - AVFORMAT_CONST AVOutputFormat* fmt = format_ctx->oformat; - AVDictionary* opt = get_option_dict(option); - if (!(fmt->flags & AVFMT_NOFILE) && - !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { - ret = avio_open2( - &format_ctx->pb, format_ctx->url, AVIO_FLAG_WRITE, nullptr, &opt); - if (ret < 0) { - av_dict_free(&opt); - TORCH_CHECK( - false, - "Failed to open dst: ", - format_ctx->url, - " (", - av_err2string(ret), - ")"); - } - } - - ret = avformat_write_header(format_ctx, &opt); - clean_up_dict(opt); - TORCH_CHECK( - ret >= 0, - "Failed to write header: ", - format_ctx->url, - " (", - av_err2string(ret), - ")"); - is_open = true; -} - -void StreamingMediaEncoder::close() { - int ret = av_write_trailer(format_ctx); - if (ret < 0) { - LOG(WARNING) << "Failed to write trailer. (" << av_err2string(ret) << ")."; - } - - // Close the file if it was not provided by client code (i.e. when not - // file-like object) - AVFORMAT_CONST AVOutputFormat* fmt = format_ctx->oformat; - if (!(fmt->flags & AVFMT_NOFILE) && - !(format_ctx->flags & AVFMT_FLAG_CUSTOM_IO)) { - // avio_closep can be only applied to AVIOContext opened by avio_open - avio_closep(&(format_ctx->pb)); - } - is_open = false; -} - -void StreamingMediaEncoder::write_audio_chunk( - int i, - const torch::Tensor& waveform, - const std::optional& pts) { - TORCH_CHECK(is_open, "Output is not opened. Did you call `open` method?"); - TORCH_CHECK( - 0 <= i && i < static_cast(format_ctx->nb_streams), - "Invalid stream index. Index must be in range of [0, ", - format_ctx->nb_streams, - "). Found: ", - i); - TORCH_CHECK( - format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_AUDIO, - "Stream ", - i, - " is not audio type."); - processes.at(i).process(waveform, pts); -} - -void StreamingMediaEncoder::write_video_chunk( - int i, - const torch::Tensor& frames, - const std::optional& pts) { - TORCH_CHECK(is_open, "Output is not opened. 
Did you call `open` method?"); - TORCH_CHECK( - 0 <= i && i < static_cast(format_ctx->nb_streams), - "Invalid stream index. Index must be in range of [0, ", - format_ctx->nb_streams, - "). Found: ", - i); - TORCH_CHECK( - format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO, - "Stream ", - i, - " is not video type."); - processes.at(i).process(frames, pts); -} - -void StreamingMediaEncoder::write_packet(const AVPacketPtr& packet) { - TORCH_CHECK(is_open, "Output is not opened. Did you call `open` method?"); - int src_stream_index = packet->stream_index; - TORCH_CHECK( - packet_writers.count(src_stream_index), - "Invalid packet stream source index ", - src_stream_index); - packet_writers.at(src_stream_index).write_packet(packet); -} - -void StreamingMediaEncoder::write_frame(int i, AVFrame* frame) { - TORCH_CHECK(is_open, "Output is not opened. Did you call `open` method?"); - TORCH_CHECK( - 0 <= i && i < static_cast(format_ctx->nb_streams), - "Invalid stream index. Index must be in range of [0, ", - format_ctx->nb_streams, - "). Found: ", - i); - processes.at(i).process_frame(frame); -} - -void StreamingMediaEncoder::flush() { - TORCH_CHECK(is_open, "Output is not opened. 
Did you call `open` method?"); - for (auto& p : processes) { - p.second.flush(); - } -} - -int StreamingMediaEncoder::num_output_streams() { - return static_cast(processes.size() + packet_writers.size()); -} - -//////////////////////////////////////////////////////////////////////////////// -// StreamingMediaEncoderCustomIO -//////////////////////////////////////////////////////////////////////////////// - -namespace detail { -namespace { -AVIOContext* get_io_context( - void* opaque, - int buffer_size, - int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)) { - unsigned char* buffer = static_cast(av_malloc(buffer_size)); - TORCH_CHECK(buffer, "Failed to allocate buffer."); - AVIOContext* io_ctx = avio_alloc_context( - buffer, buffer_size, 1, opaque, nullptr, write_packet, seek); - if (!io_ctx) { - av_freep(&buffer); - TORCH_CHECK(false, "Failed to allocate AVIOContext."); - } - return io_ctx; -} -} // namespace - -CustomOutput::CustomOutput( - void* opaque, - int buffer_size, - int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)) - : io_ctx(get_io_context(opaque, buffer_size, write_packet, seek)) {} -} // namespace detail - -StreamingMediaEncoderCustomIO::StreamingMediaEncoderCustomIO( - void* opaque, - const std::optional& format, - int buffer_size, - int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)) - : CustomOutput(opaque, buffer_size, write_packet, seek), - StreamingMediaEncoder(io_ctx, format) {} - -} // namespace io -} // namespace torio diff --git a/src/libtorio/ffmpeg/stream_writer/stream_writer.h b/src/libtorio/ffmpeg/stream_writer/stream_writer.h deleted file mode 100644 index a646d3f38a..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/stream_writer.h +++ /dev/null @@ -1,344 +0,0 @@ -#pragma once - -#include -#include -#include -#include 
-#include -#include - -namespace torio { -namespace io { - -//////////////////////////////////////////////////////////////////////////////// -// StreamingMediaEncoder -//////////////////////////////////////////////////////////////////////////////// - -/// -/// Encode and write audio/video streams chunk by chunk -/// -class StreamingMediaEncoder { - AVFormatOutputContextPtr format_ctx; - std::map processes; - std::map packet_writers; - - AVPacketPtr pkt{alloc_avpacket()}; - bool is_open = false; - int current_key = 0; - - /// @cond - - private: - explicit StreamingMediaEncoder(AVFormatContext*); - - protected: - /// Construct StreamingMediaEncoder from custom IO - /// - /// @param io_ctx Custom IO. - /// @param format Specify output format. - explicit StreamingMediaEncoder( - AVIOContext* io_ctx, - const std::optional& format = std::nullopt); - - /// @endcond - - public: - /// Construct StreamingMediaEncoder from destination URI - /// - /// @param dst Destination where encoded data are written. - /// @param format Specify output format. If not provided, it is guessed from - /// ``dst``. - explicit StreamingMediaEncoder( - const std::string& dst, - const std::optional& format = std::nullopt); - - // Non-copyable - StreamingMediaEncoder(const StreamingMediaEncoder&) = delete; - StreamingMediaEncoder& operator=(const StreamingMediaEncoder&) = delete; - - ////////////////////////////////////////////////////////////////////////////// - // Query methods - ////////////////////////////////////////////////////////////////////////////// - public: - /// @cond - - /// Print the configured outputs - void dump_format(int64_t i); - - /// @endcond - - ////////////////////////////////////////////////////////////////////////////// - // Configure methods - ////////////////////////////////////////////////////////////////////////////// - public: - /// Add an output audio stream. - /// - /// @param sample_rate The sample rate. - /// @param num_channels The number of channels. 
- /// @param format Input sample format, which determines the dtype - /// of the input tensor. - /// @parblock - /// - /// - ``"u8"``: The input tensor must be ``torch.uint8`` type. - /// - ``"s16"``: The input tensor must be ``torch.int16`` type. - /// - ``"s32"``: The input tensor must be ``torch.int32`` type. - /// - ``"s64"``: The input tensor must be ``torch.int64`` type. - /// - ``"flt"``: The input tensor must be ``torch.float32`` type. - /// - ``"dbl"``: The input tensor must be ``torch.float64`` type. - /// - /// Default: ``"flt"``. - /// @endparblock - /// @param encoder The name of the encoder to be used. - /// @parblock - /// When provided, use the specified encoder instead of the default one. - /// - /// To list the available encoders, you can use ``ffmpeg -encoders`` command. - /// @endparblock - /// @param encoder_option Options passed to encoder. - /// To list encoder options for a encoder, you can use - /// ``ffmpeg -h encoder=``. - /// @param encoder_format Format used to encode media. - /// When encoder supports multiple formats, passing this argument will - /// override the format used for encoding. - /// To list supported formats for the encoder, you can use - /// ``ffmpeg -h encoder=`` command. - /// @param encoder_sample_rate If provided, perform resampling - /// before encoding. - /// @param encoder_num_channels If provided, change channel configuration - /// before encoding. - /// @param codec_config Codec configuration. 
- /// @param filter_desc Additional processing to apply before - /// encoding the input data - void add_audio_stream( - int sample_rate, - int num_channels, - const std::string& format, - const std::optional& encoder = std::nullopt, - const std::optional& encoder_option = std::nullopt, - const std::optional& encoder_format = std::nullopt, - const std::optional& encoder_sample_rate = std::nullopt, - const std::optional& encoder_num_channels = std::nullopt, - const std::optional& codec_config = std::nullopt, - const std::optional& filter_desc = std::nullopt); - - /// Add an output video stream. - /// - /// @param frame_rate Frame rate - /// @param width Width - /// @param height Height - /// @param format Input pixel format, which determines the - /// color channel order of the input tensor. - /// @parblock - /// - /// - ``"gray8"``: One channel, grayscale. - /// - ``"rgb24"``: Three channels in the order of RGB. - /// - ``"bgr24"``: Three channels in the order of BGR. - /// - ``"yuv444p"``: Three channels in the order of YUV. - /// - /// In either case, the input tensor has to be ``torch.uint8`` type and - /// the shape must be (frame, channel, height, width). - /// @endparblock - /// @param encoder See ``add_audio_stream()``. - /// @param encoder_option See ``add_audio_stream()``. - /// @param encoder_format See ``add_audio_stream()``. - /// @param encoder_frame_rate If provided, change frame rate before encoding. - /// @param encoder_width If provided, resize image before encoding. - /// @param encoder_height If provided, resize image before encoding. - /// @param hw_accel Enable hardware acceleration. - /// @param codec_config Codec configuration. - /// @parblock - /// When video is encoded on CUDA hardware, for example - /// `encoder="h264_nvenc"`, passing CUDA device indicator to `hw_accel` - /// (i.e. `hw_accel="cuda:0"`) will make StreamingMediaEncoder expect video - /// chunk to be a CUDA Tensor. Passing CPU Tensor will result in an error. 
- /// - /// If `None`, the video chunk Tensor has to be a CPU Tensor. - /// @endparblock - /// @param filter_desc Additional processing to apply before - /// encoding the input data - void add_video_stream( - double frame_rate, - int width, - int height, - const std::string& format, - const std::optional& encoder = std::nullopt, - const std::optional& encoder_option = std::nullopt, - const std::optional& encoder_format = std::nullopt, - const std::optional& encoder_frame_rate = std::nullopt, - const std::optional& encoder_width = std::nullopt, - const std::optional& encoder_height = std::nullopt, - const std::optional& hw_accel = std::nullopt, - const std::optional& codec_config = std::nullopt, - const std::optional& filter_desc = std::nullopt); - /// @cond - /// Add output audio frame stream. - /// Allows for writing frames rather than tensors via `write_frame`. - /// - /// See `add_audio_stream` for more detail on input parameters. - void add_audio_frame_stream( - int sample_rate, - int num_channels, - const std::string& format, - const std::optional& encoder = std::nullopt, - const std::optional& encoder_option = std::nullopt, - const std::optional& encoder_format = std::nullopt, - const std::optional& encoder_sample_rate = std::nullopt, - const std::optional& encoder_num_channels = std::nullopt, - const std::optional& codec_config = std::nullopt, - const std::optional& filter_desc = std::nullopt); - - /// Add output video frame stream. - /// Allows for writing frames rather than tensors via `write_frame`. - /// - /// See `add_video_stream` for more detail on input parameters. 
- void add_video_frame_stream( - double frame_rate, - int width, - int height, - const std::string& format, - const std::optional& encoder = std::nullopt, - const std::optional& encoder_option = std::nullopt, - const std::optional& encoder_format = std::nullopt, - const std::optional& encoder_frame_rate = std::nullopt, - const std::optional& encoder_width = std::nullopt, - const std::optional& encoder_height = std::nullopt, - const std::optional& hw_accel = std::nullopt, - const std::optional& codec_config = std::nullopt, - const std::optional& filter_desc = std::nullopt); - - /// Add packet stream. Intended to be used in conjunction with - /// ``StreamingMediaDecoder`` to perform packet passthrough. - /// @param stream_params Stream parameters returned by - /// ``StreamingMediaDecoder::get_src_stream_params()`` for the packet stream - /// to pass through. - void add_packet_stream(const StreamParams& stream_params); - - /// @endcond - - /// Set file-level metadata - /// @param metadata metadata. - void set_metadata(const OptionDict& metadata); - - ////////////////////////////////////////////////////////////////////////////// - // Write methods - ////////////////////////////////////////////////////////////////////////////// - public: - /// Open the output file / device and write the header. - /// - /// @param opt Private options for protocol, device and muxer. - void open(const std::optional& opt = std::nullopt); - /// Close the output file / device and finalize metadata. - void close(); - - /// Write audio data - /// @param i Stream index. - /// @param frames Waveform tensor. Shape: ``(frame, channel)``. - /// The ``dtype`` must match what was passed to ``add_audio_stream()`` method. - /// @param pts - /// @parblock - /// Presentation timestamp. If provided, it overwrites the PTS of - /// the first frame with the provided one. Otherwise, PTS are incremented per - /// an inverse of sample rate. Only values exceed the PTS values processed - /// internally. 
- /// - /// __NOTE__: The provided value is converted to integer value expressed - /// in basis of sample rate. - /// Therefore, it is truncated to the nearest value of ``n / sample_rate``. - /// @endparblock - void write_audio_chunk( - int i, - const torch::Tensor& frames, - const std::optional& pts = std::nullopt); - /// Write video data - /// @param i Stream index. - /// @param frames Video/image tensor. Shape: ``(time, channel, height, - /// width)``. The ``dtype`` must be ``torch.uint8``. The shape ``(height, - /// width and the number of channels)`` must match what was configured when - /// calling ``add_video_stream()``. - /// @param pts - /// @parblock - /// Presentation timestamp. If provided, it overwrites the PTS of - /// the first frame with the provided one. Otherwise, PTS are incremented per - /// an inverse of frame rate. Only values exceed the PTS values processed - /// internally. - /// - /// __NOTE__: The provided value is converted to integer value expressed - /// in basis of frame rate. - /// Therefore, it is truncated to the nearest value of ``n / frame_rate``. - /// @endparblock - void write_video_chunk( - int i, - const torch::Tensor& frames, - const std::optional& pts = std::nullopt); - /// @cond - /// Write frame to stream. - /// @param i Stream index. - /// @param frame Frame to write. - void write_frame(int i, AVFrame* frame); - /// Write packet. - /// @param packet Packet to write, passed from ``StreamingMediaDecoder``. - void write_packet(const AVPacketPtr& packet); - /// @endcond - - /// Flush the frames from encoders and write the frames to the destination. 
- void flush(); - - private: - int num_output_streams(); -}; - -//////////////////////////////////////////////////////////////////////////////// -// StreamingMediaEncoderCustomIO -//////////////////////////////////////////////////////////////////////////////// - -/// @cond - -namespace detail { -struct CustomOutput { - AVIOContextPtr io_ctx; - CustomOutput( - void* opaque, - int buffer_size, - int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence)); -}; -} // namespace detail - -/// @endcond - -/// -/// A subclass of StreamingMediaDecoder which works with custom read function. -/// Can be used for encoding media into memory or custom object. -/// -class StreamingMediaEncoderCustomIO : private detail::CustomOutput, - public StreamingMediaEncoder { - public: - /// Construct StreamingMediaEncoderCustomIO with custom write and seek - /// functions. - /// - /// @param opaque Custom data used by ``write_packet`` and ``seek`` functions. - /// @param format Specify output format. - /// @param buffer_size The size of the intermediate buffer, which FFmpeg uses - /// to pass data to write_packet function. - /// @param write_packet Custom write function that is called from FFmpeg to - /// actually write data to the custom destination. - /// @param seek Optional seek function that is used to seek the destination. 
- StreamingMediaEncoderCustomIO( - void* opaque, - const std::optional& format, - int buffer_size, - int (*write_packet)(void* opaque, uint8_t* buf, int buf_size), - int64_t (*seek)(void* opaque, int64_t offset, int whence) = nullptr); -}; - -// For BC -using StreamWriter = StreamingMediaEncoder; -using StreamWriterCustomIO = StreamingMediaEncoderCustomIO; - -} // namespace io -} // namespace torio - -// For BC -namespace torchaudio::io { -using namespace torio::io; -} // namespace torchaudio::io diff --git a/src/libtorio/ffmpeg/stream_writer/tensor_converter.cpp b/src/libtorio/ffmpeg/stream_writer/tensor_converter.cpp deleted file mode 100644 index 097cae170f..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/tensor_converter.cpp +++ /dev/null @@ -1,497 +0,0 @@ -#include - -#ifdef USE_CUDA -#include -#endif - -namespace torio::io { - -namespace { - -using namespace torch::indexing; - -using InitFunc = TensorConverter::InitFunc; -using ConvertFunc = TensorConverter::ConvertFunc; - -//////////////////////////////////////////////////////////////////////////////// -// Audio -//////////////////////////////////////////////////////////////////////////////// - -void validate_audio_input( - const torch::Tensor& t, - AVFrame* buffer, - c10::ScalarType dtype) { - TORCH_CHECK( - t.dtype().toScalarType() == dtype, - "Expected ", - dtype, - " type. Found: ", - t.dtype().toScalarType()); - TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU."); - TORCH_CHECK(t.dim() == 2, "Input Tensor has to be 2D."); - TORCH_CHECK( - t.size(1) == buffer->channels, - "Expected waveform with ", - buffer->channels, - " channels. Found ", - t.size(1)); -} - -// 2D (time, channel) and contiguous. 
-void convert_func_(const torch::Tensor& chunk, AVFrame* buffer) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.dim() == 2); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(chunk.size(1) == buffer->channels); - - // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00334 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); - TORCH_INTERNAL_ASSERT( - ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); - } - - auto byte_size = chunk.numel() * chunk.element_size(); - memcpy(buffer->data[0], chunk.data_ptr(), byte_size); - buffer->nb_samples = static_cast(chunk.size(0)); -} - -std::pair get_audio_func(AVFrame* buffer) { - auto dtype = [&]() -> c10::ScalarType { - switch (static_cast(buffer->format)) { - case AV_SAMPLE_FMT_U8: - return c10::ScalarType::Byte; - case AV_SAMPLE_FMT_S16: - return c10::ScalarType::Short; - case AV_SAMPLE_FMT_S32: - return c10::ScalarType::Int; - case AV_SAMPLE_FMT_S64: - return c10::ScalarType::Long; - case AV_SAMPLE_FMT_FLT: - return c10::ScalarType::Float; - case AV_SAMPLE_FMT_DBL: - return c10::ScalarType::Double; - default: - TORCH_INTERNAL_ASSERT( - false, "Audio encoding process is not properly configured."); - } - }(); - - InitFunc init_func = [=](const torch::Tensor& tensor, AVFrame* buffer) { - validate_audio_input(tensor, buffer, dtype); - return tensor.contiguous(); - }; - return {init_func, convert_func_}; -} - -//////////////////////////////////////////////////////////////////////////////// -// Video -//////////////////////////////////////////////////////////////////////////////// - -void validate_video_input( - const torch::Tensor& t, - AVFrame* buffer, - int num_channels) { - if (buffer->hw_frames_ctx) { - TORCH_CHECK(t.device().is_cuda(), "Input tensor has to be on CUDA."); - } else { - TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU."); - } - TORCH_CHECK( - t.dtype().toScalarType() == c10::ScalarType::Byte, - "Expected Tensor of uint8 type."); - - TORCH_CHECK(t.dim() == 4, 
"Input Tensor has to be 4D."); - TORCH_CHECK( - t.size(1) == num_channels && t.size(2) == buffer->height && - t.size(3) == buffer->width, - "Expected tensor with shape (N, ", - num_channels, - ", ", - buffer->height, - ", ", - buffer->width, - ") (NCHW format). Found ", - t.sizes()); -} - -// Special case where encode pixel format is RGB0/BGR0 but the tensor is RGB/BGR -void validate_rgb0(const torch::Tensor& t, AVFrame* buffer) { - if (buffer->hw_frames_ctx) { - TORCH_CHECK(t.device().is_cuda(), "Input tensor has to be on CUDA."); - } else { - TORCH_CHECK(t.device().is_cpu(), "Input tensor has to be on CPU."); - } - TORCH_CHECK( - t.dtype().toScalarType() == c10::ScalarType::Byte, - "Expected Tensor of uint8 type."); - - TORCH_CHECK(t.dim() == 4, "Input Tensor has to be 4D."); - TORCH_CHECK( - t.size(2) == buffer->height && t.size(3) == buffer->width, - "Expected tensor with shape (N, 3, ", - buffer->height, - ", ", - buffer->width, - ") (NCHW format). Found ", - t.sizes()); -} - -// NCHW ->NHWC, ensure contiguous -torch::Tensor init_interlaced(const torch::Tensor& tensor) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.dim() == 4); - return tensor.permute({0, 2, 3, 1}).contiguous(); -} - -// Keep NCHW, ensure contiguous -torch::Tensor init_planar(const torch::Tensor& tensor) { - return tensor.contiguous(); -} - -// Interlaced video -// Each frame is composed of one plane, and color components for each pixel are -// collocated. -// The memory layout is 1D linear, interpretated as following. -// -// |<----- linesize[0] ------>| -// |<-- stride -->| -// 0 1 ... W -// 0: RGB RGB ... RGB PAD ... PAD -// 1: RGB RGB ... RGB PAD ... PAD -// ... -// H: RGB RGB ... RGB PAD ... 
PAD -void write_interlaced_video( - const torch::Tensor& frame, - AVFrame* buffer, - int num_channels) { - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == buffer->height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(2) == buffer->width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels); - - // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); - TORCH_INTERNAL_ASSERT( - ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); - } - - size_t stride = buffer->width * num_channels; - uint8_t* src = frame.data_ptr(); - uint8_t* dst = buffer->data[0]; - for (int h = 0; h < buffer->height; ++h) { - std::memcpy(dst, src, stride); - src += stride; - dst += buffer->linesize[0]; - } -} - -// Planar video -// Each frame is composed of multiple planes. -// One plane can contain one of more color components. -// (but at the moment only accept formats without subsampled color components) -// -// The memory layout is interpreted as follow -// -// |<----- linesize[0] ----->| -// 0 1 ... W1 -// 0: Y Y ... Y PAD ... PAD -// 1: Y Y ... Y PAD ... PAD -// ... -// H1: Y Y ... Y PAD ... PAD -// -// |<--- linesize[1] ---->| -// 0 ... W2 -// 0: UV ... UV PAD ... PAD -// 1: UV ... UV PAD ... PAD -// ... -// H2: UV ... UV PAD ... 
PAD -// -void write_planar_video( - const torch::Tensor& frame, - AVFrame* buffer, - int num_planes) { - const auto num_colors = - av_pix_fmt_desc_get((AVPixelFormat)buffer->format)->nb_components; - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_colors); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(2), buffer->height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3), buffer->width); - - // https://ffmpeg.org/doxygen/4.1/muxing_8c_source.html#l00472 - if (!av_frame_is_writable(buffer)) { - int ret = av_frame_make_writable(buffer); - TORCH_INTERNAL_ASSERT( - ret >= 0, "Failed to make frame writable: ", av_err2string(ret)); - } - - for (int j = 0; j < num_colors; ++j) { - uint8_t* src = frame.index({0, j}).data_ptr(); - uint8_t* dst = buffer->data[j]; - for (int h = 0; h < buffer->height; ++h) { - memcpy(dst, src, buffer->width); - src += buffer->width; - dst += buffer->linesize[j]; - } - } -} - -void write_interlaced_video_cuda( - const torch::Tensor& frame, - AVFrame* buffer, - int num_channels) { -#ifndef USE_CUDA - TORCH_CHECK( - false, - "torchaudio is not compiled with CUDA support. 
Hardware acceleration is not available."); -#else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == buffer->height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(2) == buffer->width); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == num_channels); - size_t spitch = buffer->width * num_channels; - if (cudaSuccess != - cudaMemcpy2D( - (void*)(buffer->data[0]), - buffer->linesize[0], - (const void*)(frame.data_ptr()), - spitch, - spitch, - buffer->height, - cudaMemcpyDeviceToDevice)) { - TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor."); - } -#endif -} - -void write_planar_video_cuda( - const torch::Tensor& frame, - AVFrame* buffer, - int num_planes) { -#ifndef USE_CUDA - TORCH_CHECK( - false, - "torchaudio is not compiled with CUDA support. Hardware acceleration is not available."); -#else - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.dim() == 4); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(0) == 1); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(1) == num_planes); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(2) == buffer->height); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(frame.size(3) == buffer->width); - for (int j = 0; j < num_planes; ++j) { - if (cudaSuccess != - cudaMemcpy2D( - (void*)(buffer->data[j]), - buffer->linesize[j], - (const void*)(frame.index({0, j}).data_ptr()), - buffer->width, - buffer->width, - buffer->height, - cudaMemcpyDeviceToDevice)) { - TORCH_CHECK(false, "Failed to copy pixel data from CUDA tensor."); - } - } -#endif -} - -std::pair get_video_func(AVFrame* buffer) { - if (buffer->hw_frames_ctx) { - auto frames_ctx = (AVHWFramesContext*)(buffer->hw_frames_ctx->data); - auto sw_pix_fmt = frames_ctx->sw_format; - switch (sw_pix_fmt) { - case AV_PIX_FMT_RGB0: - case AV_PIX_FMT_BGR0: { - ConvertFunc convert_func = [](const torch::Tensor& t, AVFrame* f) { - write_interlaced_video_cuda(t, f, 4); - }; - InitFunc init_func = 
[](const torch::Tensor& t, AVFrame* f) { - // Special treatment for the case user pass regular RGB/BGR tensor. - if (t.dim() == 4 && t.size(1) == 3) { - validate_rgb0(t, f); - auto tmp = - torch::empty({t.size(0), t.size(2), t.size(3), 4}, t.options()); - tmp.index_put_({"...", Slice(0, 3)}, t.permute({0, 2, 3, 1})); - return tmp; - } - validate_video_input(t, f, 4); - return init_interlaced(t); - }; - return {init_func, convert_func}; - } - case AV_PIX_FMT_GBRP: - case AV_PIX_FMT_GBRP16LE: - case AV_PIX_FMT_YUV444P: - case AV_PIX_FMT_YUV444P16LE: { - ConvertFunc convert_func = [](const torch::Tensor& t, AVFrame* f) { - write_planar_video_cuda(t, f, 3); - }; - InitFunc init_func = [](const torch::Tensor& t, AVFrame* f) { - validate_video_input(t, f, 3); - return init_planar(t); - }; - return {init_func, convert_func}; - } - default: - TORCH_CHECK( - false, - "Unexpected pixel format for CUDA: ", - av_get_pix_fmt_name(sw_pix_fmt)); - } - } - - auto pix_fmt = static_cast(buffer->format); - switch (pix_fmt) { - case AV_PIX_FMT_GRAY8: - case AV_PIX_FMT_RGB24: - case AV_PIX_FMT_BGR24: { - int channels = av_pix_fmt_desc_get(pix_fmt)->nb_components; - InitFunc init_func = [=](const torch::Tensor& t, AVFrame* f) { - validate_video_input(t, f, channels); - return init_interlaced(t); - }; - ConvertFunc convert_func = [=](const torch::Tensor& t, AVFrame* f) { - write_interlaced_video(t, f, channels); - }; - return {init_func, convert_func}; - } - case AV_PIX_FMT_RGB0: - case AV_PIX_FMT_BGR0: { - InitFunc init_func = [](const torch::Tensor& t, AVFrame* f) { - if (t.dim() == 4 && t.size(1) == 3) { - validate_rgb0(t, f); - auto tmp = - torch::empty({t.size(0), t.size(2), t.size(3), 4}, t.options()); - tmp.index_put_({"...", Slice(0, 3)}, t.permute({0, 2, 3, 1})); - return tmp; - } - validate_video_input(t, f, 4); - return init_interlaced(t); - }; - ConvertFunc convert_func = [](const torch::Tensor& t, AVFrame* f) { - write_interlaced_video(t, f, 4); - }; - return {init_func, 
convert_func}; - } - case AV_PIX_FMT_YUV444P: { - InitFunc init_func = [](const torch::Tensor& t, AVFrame* f) { - validate_video_input(t, f, 3); - return init_planar(t); - }; - ConvertFunc convert_func = [](const torch::Tensor& t, AVFrame* f) { - write_planar_video(t, f, 3); - }; - return {init_func, convert_func}; - } - default: - TORCH_CHECK( - false, "Unexpected pixel format: ", av_get_pix_fmt_name(pix_fmt)); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Unknown (for supporting frame writing) -//////////////////////////////////////////////////////////////////////////////// -std::pair get_frame_func() { - InitFunc init_func = [](const torch::Tensor& tensor, - AVFrame* buffer) -> torch::Tensor { - TORCH_CHECK( - false, - "This shouldn't have been called. " - "If you intended to write frames, please select a stream that supports doing so."); - }; - ConvertFunc convert_func = [](const torch::Tensor& tensor, AVFrame* buffer) { - TORCH_CHECK( - false, - "This shouldn't have been called. 
" - "If you intended to write frames, please select a stream that supports doing so."); - }; - return {init_func, convert_func}; -} - -} // namespace - -//////////////////////////////////////////////////////////////////////////////// -// TensorConverter -//////////////////////////////////////////////////////////////////////////////// - -TensorConverter::TensorConverter(AVMediaType type, AVFrame* buf, int buf_size) - : buffer(buf), buffer_size(buf_size) { - switch (type) { - case AVMEDIA_TYPE_AUDIO: - std::tie(init_func, convert_func) = get_audio_func(buffer); - break; - case AVMEDIA_TYPE_VIDEO: - std::tie(init_func, convert_func) = get_video_func(buffer); - break; - case AVMEDIA_TYPE_UNKNOWN: - std::tie(init_func, convert_func) = get_frame_func(); - break; - default: - TORCH_INTERNAL_ASSERT( - false, "Unsupported media type: ", av_get_media_type_string(type)); - } -} - -using Generator = TensorConverter::Generator; - -Generator TensorConverter::convert(const torch::Tensor& t) { - return Generator{init_func(t, buffer), buffer, convert_func, buffer_size}; -} - -//////////////////////////////////////////////////////////////////////////////// -// Generator -//////////////////////////////////////////////////////////////////////////////// - -using Iterator = Generator::Iterator; - -Generator::Generator( - torch::Tensor frames_, - AVFrame* buff, - ConvertFunc& func, - int64_t step_) - : frames(std::move(frames_)), - buffer(buff), - convert_func(func), - step(step_) {} - -Iterator Generator::begin() const { - return Iterator{frames, buffer, convert_func, step}; -} - -int64_t Generator::end() const { - return frames.size(0); -} - -//////////////////////////////////////////////////////////////////////////////// -// Iterator -//////////////////////////////////////////////////////////////////////////////// - -Iterator::Iterator( - const torch::Tensor frames_, - AVFrame* buffer_, - ConvertFunc& convert_func_, - int64_t step_) - : frames(frames_), - buffer(buffer_), - 
convert_func(convert_func_), - step(step_) {} - -Iterator& Iterator::operator++() { - i += step; - return *this; -} - -AVFrame* Iterator::operator*() const { - using namespace torch::indexing; - convert_func(frames.index({Slice{i, i + step}}), buffer); - return buffer; -} - -bool Iterator::operator!=(const int64_t end) const { - // This is used for detecting the end of iteraton. - // For audio, iteration is done by - return i < end; -} - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/tensor_converter.h b/src/libtorio/ffmpeg/stream_writer/tensor_converter.h deleted file mode 100644 index b6015889a3..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/tensor_converter.h +++ /dev/null @@ -1,95 +0,0 @@ -#pragma once - -#include -#include - -namespace torio::io { - -class TensorConverter { - public: - // Initialization is one-time process applied to frames before the iteration - // starts. i.e. either convert to NHWC. - using InitFunc = std::function; - // Convert function writes input frame Tensor to destinatoin AVFrame - // both tensor input and AVFrame are expected to be valid and properly - // allocated. (i.e. glorified copy). It is used in Iterator. - using ConvertFunc = std::function; - - ////////////////////////////////////////////////////////////////////////////// - // Generator - ////////////////////////////////////////////////////////////////////////////// - // Generator class is responsible for implementing an interface - // compatible with range-based for loop interface (begin and end). - class Generator { - public: - //////////////////////////////////////////////////////////////////////////// - // Iterator - //////////////////////////////////////////////////////////////////////////// - // Iterator class is responsible for implementing iterator protocol, that is - // increment, comaprison against, and dereference (applying conversion - // function in it). 
- class Iterator { - // Tensor to be sliced - // - audio: NC, CPU, uint8|int16|float|double - // - video: NCHW or NHWC, CPU or CUDA, uint8 - // It will be sliced at dereference time. - const torch::Tensor frames; - // Output buffer (not owned, but modified by Iterator) - AVFrame* buffer; - // Function that converts one frame Tensor into AVFrame. - ConvertFunc& convert_func; - - // Index - int64_t step; - int64_t i = 0; - - public: - Iterator( - const torch::Tensor tensor, - AVFrame* buffer, - ConvertFunc& convert_func, - int64_t step); - - Iterator& operator++(); - AVFrame* operator*() const; - bool operator!=(const int64_t other) const; - }; - - private: - // Input Tensor: - // - video: NCHW, CPU|CUDA, uint8, - // - audio: NC, CPU, uin8|int16|int32|in64|float32|double - torch::Tensor frames; - - // Output buffer (not owned, passed to iterator) - AVFrame* buffer; - - // ops: not owned. - ConvertFunc& convert_func; - - int64_t step; - - public: - Generator( - torch::Tensor frames, - AVFrame* buffer, - ConvertFunc& convert_func, - int64_t step = 1); - - [[nodiscard]] Iterator begin() const; - [[nodiscard]] int64_t end() const; - }; - - private: - AVFrame* buffer; - const int buffer_size = 1; - - InitFunc init_func{}; - ConvertFunc convert_func{}; - - public: - TensorConverter(AVMediaType type, AVFrame* buffer, int buffer_size = 1); - Generator convert(const torch::Tensor& t); -}; - -} // namespace torio::io diff --git a/src/libtorio/ffmpeg/stream_writer/types.h b/src/libtorio/ffmpeg/stream_writer/types.h deleted file mode 100644 index 567af8e486..0000000000 --- a/src/libtorio/ffmpeg/stream_writer/types.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once -namespace torio::io { - -struct CodecConfig { - int bit_rate = -1; - int compression_level = -1; - - // qscale corresponds to ffmpeg CLI's qscale. 
- // Example: MP3 - // https://trac.ffmpeg.org/wiki/Encode/MP3 - // This should be set like - // https://github.com/FFmpeg/FFmpeg/blob/n4.3.2/fftools/ffmpeg_opt.c#L1550 - const std::optional qscale = -1; - - // video - int gop_size = -1; - int max_b_frames = -1; -}; -} // namespace torio::io diff --git a/src/torio/__init__.py b/src/torio/__init__.py deleted file mode 100644 index 23efa0b2fd..0000000000 --- a/src/torio/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from . import _extension # noqa # usort: skip -from . import io, utils - - -__all__ = [ - "io", - "utils", -] diff --git a/src/torio/_extension/__init__.py b/src/torio/_extension/__init__.py deleted file mode 100644 index f11ace8831..0000000000 --- a/src/torio/_extension/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from .utils import _init_ffmpeg, _LazyImporter - - -_FFMPEG_EXT = None - - -def lazy_import_ffmpeg_ext(): - """Load FFmpeg integration based on availability in lazy manner""" - - global _FFMPEG_EXT - if _FFMPEG_EXT is None: - _FFMPEG_EXT = _LazyImporter("_torio_ffmpeg", _init_ffmpeg) - return _FFMPEG_EXT diff --git a/src/torio/_extension/utils.py b/src/torio/_extension/utils.py deleted file mode 100644 index c72d59c16f..0000000000 --- a/src/torio/_extension/utils.py +++ /dev/null @@ -1,147 +0,0 @@ -import importlib -import logging -import os -import types -from pathlib import Path - -import torch - -_LG = logging.getLogger(__name__) -_LIB_DIR = Path(__file__).parent.parent / "lib" - - -class _LazyImporter(types.ModuleType): - """Lazily import module/extension.""" - - def __init__(self, name, import_func): - super().__init__(name) - self.import_func = import_func - self.module = None - - # Note: - # Python caches what was retrieved with `__getattr__`, so this method will not be - # called again for the same item. 
- def __getattr__(self, item): - self._import_once() - return getattr(self.module, item) - - def __repr__(self): - if self.module is None: - return f"" - return repr(self.module) - - def __dir__(self): - self._import_once() - return dir(self.module) - - def _import_once(self): - if self.module is None: - self.module = self.import_func() - # Note: - # By attaching the module attributes to self, - # module attributes are directly accessible. - # This allows to avoid calling __getattr__ for every attribute access. - self.__dict__.update(self.module.__dict__) - - def is_available(self): - try: - self._import_once() - except Exception: - return False - return True - - -def _get_lib_path(lib: str): - suffix = "pyd" if os.name == "nt" else "so" - path = _LIB_DIR / f"{lib}.{suffix}" - return path - - -def _load_lib(lib: str) -> bool: - """Load extension module - - Note: - In case `torio` is deployed with `pex` format, the library file - is not in a standard location. - In this case, we expect that `libtorio` is available somewhere - in the search path of dynamic loading mechanism, so that importing - `_torio` will have library loader find and load `libtorio`. - This is the reason why the function should not raising an error when the library - file is not found. - - Returns: - bool: - True if the library file is found AND the library loaded without failure. - False if the library file is not found (like in the case where torio - is deployed with pex format, thus the shared library file is - in a non-standard location.). - If the library file is found but there is an issue loading the library, - (such as missing dependency) then this function raises the exception as-is. - - Raises: - Exception: - If the library file is found, but there is an issue loading the library file, - (when underlying `ctype.DLL` throws an exception), this function will pass - the exception as-is, instead of catching it and returning bool. 
- The expected case is `OSError` thrown by `ctype.DLL` when a dynamic dependency - is not found. - This behavior was chosen because the expected failure case is not recoverable. - If a dependency is missing, then users have to install it. - """ - path = _get_lib_path(lib) - if not path.exists(): - return False - torch.ops.load_library(path) - return True - - -_FFMPEG_VERS = ["6", "5", "4", ""] - - -def _find_versionsed_ffmpeg_extension(version: str): - ext = f"torio.lib._torio_ffmpeg{version}" - lib = f"libtorio_ffmpeg{version}" - - if not importlib.util.find_spec(ext): - raise RuntimeError(f"FFmpeg{version} extension is not available.") - - _load_lib(lib) - return importlib.import_module(ext) - - -def _find_ffmpeg_extension(ffmpeg_vers): - for ffmpeg_ver in ffmpeg_vers: - _LG.debug("Loading FFmpeg%s", ffmpeg_ver) - try: - ext = _find_versionsed_ffmpeg_extension(ffmpeg_ver) - _LG.debug("Successfully loaded FFmpeg%s", ffmpeg_ver) - return ext - except Exception: - _LG.debug("Failed to load FFmpeg%s extension.", ffmpeg_ver, exc_info=True) - continue - raise ImportError( - f"Failed to intialize FFmpeg extension. Tried versions: {ffmpeg_vers}. " - "Enable DEBUG logging to see more details about the error." - ) - - -def _get_ffmpeg_versions(): - ffmpeg_vers = _FFMPEG_VERS - # User override - if (ffmpeg_ver := os.environ.get("TORIO_USE_FFMPEG_VERSION")) is not None: - if ffmpeg_ver not in ffmpeg_vers: - raise ValueError( - f"The FFmpeg version '{ffmpeg_ver}' (read from TORIO_USE_FFMPEG_VERSION) " - f"is not one of supported values. 
Possible values are {ffmpeg_vers}" - ) - ffmpeg_vers = [ffmpeg_ver] - return ffmpeg_vers - - -def _init_ffmpeg(): - ffmpeg_vers = _get_ffmpeg_versions() - ext = _find_ffmpeg_extension(ffmpeg_vers) - ext.init() - if ext.get_log_level() > 8: - ext.set_log_level(8) - return ext diff --git a/src/torio/io/__init__.py b/src/torio/io/__init__.py deleted file mode 100644 index 7fce6d7752..0000000000 --- a/src/torio/io/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from ._streaming_media_decoder import StreamingMediaDecoder -from ._streaming_media_encoder import CodecConfig, StreamingMediaEncoder - - -__all__ = [ - "StreamingMediaDecoder", - "CodecConfig", - "StreamingMediaEncoder", -] diff --git a/src/torio/io/_streaming_media_decoder.py b/src/torio/io/_streaming_media_decoder.py deleted file mode 100644 index b3d7fc538b..0000000000 --- a/src/torio/io/_streaming_media_decoder.py +++ /dev/null @@ -1,977 +0,0 @@ -from __future__ import annotations - -import os -from dataclasses import dataclass -from pathlib import Path -from typing import BinaryIO, Dict, Iterator, Optional, Tuple, TypeVar, Union - -import torch -import torio -from torch.utils._pytree import tree_map - -ffmpeg_ext = torio._extension.lazy_import_ffmpeg_ext() - -__all__ = [ - "StreamingMediaDecoder", -] - - -@dataclass -class SourceStream: - """The metadata of a source stream, returned by :meth:`~torio.io.StreamingMediaDecoder.get_src_stream_info`. - - This class is used when representing streams of media type other than `audio` or `video`. - - When source stream is `audio` or `video` type, :class:`SourceAudioStream` and - :class:`SourceVideoStream`, which reports additional media-specific attributes, - are used respectively. - """ - - media_type: str - """The type of the stream. - One of ``"audio"``, ``"video"``, ``"data"``, ``"subtitle"``, ``"attachment"`` and empty string. - - .. note:: - Only audio and video streams are supported for output. - .. 
note:: - Still images, such as PNG and JPEG formats are reported as video. - """ - codec: str - """Short name of the codec. Such as ``"pcm_s16le"`` and ``"h264"``.""" - codec_long_name: str - """Detailed name of the codec. - - Such as "`PCM signed 16-bit little-endian`" and "`H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10`". - """ - format: Optional[str] - """Media format. Such as ``"s16"`` and ``"yuv420p"``. - - Commonly found audio values are; - - - ``"u8"``, ``"u8p"``: Unsigned 8-bit unsigned interger. - - ``"s16"``, ``"s16p"``: 16-bit signed integer. - - ``"s32"``, ``"s32p"``: 32-bit signed integer. - - ``"flt"``, ``"fltp"``: 32-bit floating-point. - - .. note:: - - `p` at the end indicates the format is `planar`. - Channels are grouped together instead of interspersed in memory. - """ - bit_rate: Optional[int] - """Bit rate of the stream in bits-per-second. - This is an estimated values based on the initial few frames of the stream. - For container formats and variable bit rate, it can be 0. - """ - num_frames: Optional[int] - """The number of frames in the stream""" - bits_per_sample: Optional[int] - """This is the number of valid bits in each output sample. - For compressed format, it can be 0. - """ - metadata: Dict[str, str] - """Metadata attached to the source stream.""" - - -@dataclass -class SourceAudioStream(SourceStream): - """The metadata of an audio source stream, returned by :meth:`~torio.io.StreamingMediaDecoder.get_src_stream_info`. - - This class is used when representing audio stream. - - In addition to the attributes reported by :class:`SourceStream`, - the following attributes are reported. - """ - - sample_rate: float - """Sample rate of the audio.""" - num_channels: int - """Number of channels.""" - - -@dataclass -class SourceVideoStream(SourceStream): - """The metadata of a video source stream, returned by :meth:`~torio.io.StreamingMediaDecoder.get_src_stream_info`. - - This class is used when representing video stream. 
- - In addition to the attributes reported by :class:`SourceStream`, - the following attributes are reported. - """ - - width: int - """Width of the video frame in pixel.""" - height: int - """Height of the video frame in pixel.""" - frame_rate: float - """Frame rate.""" - - -def _parse_si(i): - media_type = i.media_type - if media_type == "audio": - return SourceAudioStream( - media_type=i.media_type, - codec=i.codec_name, - codec_long_name=i.codec_long_name, - format=i.format, - bit_rate=i.bit_rate, - num_frames=i.num_frames, - bits_per_sample=i.bits_per_sample, - metadata=i.metadata, - sample_rate=i.sample_rate, - num_channels=i.num_channels, - ) - if media_type == "video": - return SourceVideoStream( - media_type=i.media_type, - codec=i.codec_name, - codec_long_name=i.codec_long_name, - format=i.format, - bit_rate=i.bit_rate, - num_frames=i.num_frames, - bits_per_sample=i.bits_per_sample, - metadata=i.metadata, - width=i.width, - height=i.height, - frame_rate=i.frame_rate, - ) - return SourceStream( - media_type=i.media_type, - codec=i.codec_name, - codec_long_name=i.codec_long_name, - format=None, - bit_rate=None, - num_frames=None, - bits_per_sample=None, - metadata=i.metadata, - ) - - -@dataclass -class OutputStream: - """Output stream configured on :class:`StreamingMediaDecoder`, - returned by :meth:`~torio.io.StreamingMediaDecoder.get_out_stream_info`. - """ - - source_index: int - """Index of the source stream that this output stream is connected.""" - filter_description: str - """Description of filter graph applied to the source stream.""" - media_type: str - """The type of the stream. ``"audio"`` or ``"video"``.""" - format: str - """Media format. Such as ``"s16"`` and ``"yuv420p"``. - - Commonly found audio values are; - - - ``"u8"``, ``"u8p"``: Unsigned 8-bit unsigned interger. - - ``"s16"``, ``"s16p"``: 16-bit signed integer. - - ``"s32"``, ``"s32p"``: 32-bit signed integer. - - ``"flt"``, ``"fltp"``: 32-bit floating-point. - - .. 
note:: - - `p` at the end indicates the format is `planar`. - Channels are grouped together instead of interspersed in memory.""" - - -@dataclass -class OutputAudioStream(OutputStream): - """Information about an audio output stream configured with - :meth:`~torio.io.StreamingMediaDecoder.add_audio_stream` or - :meth:`~torio.io.StreamingMediaDecoder.add_basic_audio_stream`. - - In addition to the attributes reported by :class:`OutputStream`, - the following attributes are reported. - """ - - sample_rate: float - """Sample rate of the audio.""" - num_channels: int - """Number of channels.""" - - -@dataclass -class OutputVideoStream(OutputStream): - """Information about a video output stream configured with - :meth:`~torio.io.StreamingMediaDecoder.add_video_stream` or - :meth:`~torio.io.StreamingMediaDecoder.add_basic_video_stream`. - - In addition to the attributes reported by :class:`OutputStream`, - the following attributes are reported. - """ - - width: int - """Width of the video frame in pixel.""" - height: int - """Height of the video frame in pixel.""" - frame_rate: float - """Frame rate.""" - - -def _parse_oi(i): - media_type = i.media_type - if media_type == "audio": - return OutputAudioStream( - source_index=i.source_index, - filter_description=i.filter_description, - media_type=i.media_type, - format=i.format, - sample_rate=i.sample_rate, - num_channels=i.num_channels, - ) - if media_type == "video": - return OutputVideoStream( - source_index=i.source_index, - filter_description=i.filter_description, - media_type=i.media_type, - format=i.format, - width=i.width, - height=i.height, - frame_rate=i.frame_rate, - ) - raise ValueError(f"Unexpected media_type: {i.media_type}({i})") - - -def _get_afilter_desc(sample_rate: Optional[int], fmt: Optional[str], num_channels: Optional[int]): - descs = [] - if sample_rate is not None: - descs.append(f"aresample={sample_rate}") - if fmt is not None or num_channels is not None: - parts = [] - if fmt is not None: - 
parts.append(f"sample_fmts={fmt}") - if num_channels is not None: - parts.append(f"channel_layouts={num_channels}c") - descs.append(f"aformat={':'.join(parts)}") - return ",".join(descs) if descs else None - - -def _get_vfilter_desc(frame_rate: Optional[float], width: Optional[int], height: Optional[int], fmt: Optional[str]): - descs = [] - if frame_rate is not None: - descs.append(f"fps={frame_rate}") - scales = [] - if width is not None: - scales.append(f"width={width}") - if height is not None: - scales.append(f"height={height}") - if scales: - descs.append(f"scale={':'.join(scales)}") - if fmt is not None: - descs.append(f"format=pix_fmts={fmt}") - return ",".join(descs) if descs else None - - -# Base class for ChunkTensor -# Based off of TrivialTensorViaComposition -# https://github.com/albanD/subclass_zoo/blob/0eeb1d68fb59879029c610bc407f2997ae43ba0a/trivial_tensors.py#L83 -class ChunkTensorBase(torch.Tensor): - __torch_function__ = torch._C._disabled_torch_function_impl - - @staticmethod - def __new__(cls, _elem, *_): - return super().__new__(cls, _elem) - - @classmethod - def __torch_dispatch__(cls, func, _, args=(), kwargs=None): - def unwrap(t): - return t._elem if isinstance(t, cls) else t - - return func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)) - - -@dataclass -class ChunkTensor(ChunkTensorBase): - """Decoded media frames with metadata. - - The instance of this class represents the decoded video/audio frames with - metadata, and the instance itself behave like :py:class:`~torch.Tensor`. - - Client codes can pass instance of this class as-if it's - :py:class:`~torch.Tensor` class, or call the methods defined on - :py:class:`~torch.Tensor` class. - - Example: - >>> # Define input streams - >>> reader = StreamingMediaDecoder(...) 
- >>> reader.add_audio_stream(frames_per_chunk=4000, sample_rate=8000) - >>> reader.add_video_stream(frames_per_chunk=7, frame_rate=28) - >>> # Decode the streams and fetch frames - >>> reader.fill_buffer() - >>> audio_chunk, video_chunk = reader.pop_chunks() - - >>> # Access metadata - >>> (audio_chunk.pts, video_chunks.pts) - (0.0, 0.0) - >>> - >>> # The second time the PTS is different - >>> reader.fill_buffer() - >>> audio_chunk, video_chunk = reader.pop_chunks() - >>> (audio_chunk.pts, video_chunks.pts) - (0.5, 0.25) - - >>> # Call PyTorch ops on chunk - >>> audio_chunk.shape - torch.Size([4000, 2] - >>> power = torch.pow(video_chunk, 2) - >>> - >>> # the result is a plain torch.Tensor class - >>> type(power) - - >>> - >>> # Metadata is not available on the result - >>> power.pts - AttributeError: 'Tensor' object has no attribute 'pts' - """ - - # Keep it private for now - _elem: torch.Tensor - - pts: float - """Presentation time stamp of the first frame in the chunk. - - Unit: second. - """ - - -def _format_doc(**kwargs): - def decorator(obj): - obj.__doc__ = obj.__doc__.format(**kwargs) - return obj - - return decorator - - -_frames_per_chunk = """Number of frames returned as one chunk. - If the source stream is exhausted before enough frames are buffered, - then the chunk is returned as-is. - - Providing ``-1`` disables chunking and :py:func:`pop_chunks` method - will concatenate all the buffered frames and return it.""" - -_buffer_chunk_size = """Internal buffer size. - When the number of chunks buffered exceeds this number, old frames are - dropped. For example, if ``frames_per_chunk`` is 5 and ``buffer_chunk_size`` is - 3, then frames older than ``15`` are dropped. - Providing ``-1`` disables this behavior. - - Default: ``3``.""" - -_audio_stream_index = """The source audio stream index. - If omitted, :py:attr:`default_audio_stream` is used.""" - - -_video_stream_index = """The source video stream index. 
- If omitted, :py:attr:`default_video_stream` is used.""" - -_decoder = """The name of the decoder to be used. - When provided, use the specified decoder instead of the default one. - - To list the available decoders, please use - :py:func:`~torio.utils.ffmpeg_utils.get_audio_decoders` for audio, and - :py:func:`~torio.utils.ffmpeg_utils.get_video_decoders` for video. - - Default: ``None``.""" - -_decoder_option = """Options passed to decoder. - Mapping from str to str. (Default: ``None``) - - To list decoder options for a decoder, you can use - ``ffmpeg -h decoder=`` command. - - | - - In addition to decoder-specific options, you can also pass options related - to multithreading. They are effective only if the decoder support them. - If neither of them are provided, StreamingMediaDecoder defaults to single thread. - - ``"threads"``: The number of threads (in str). - Providing the value ``"0"`` will let FFmpeg decides based on its heuristics. - - ``"thread_type"``: Which multithreading method to use. - The valid values are ``"frame"`` or ``"slice"``. - Note that each decoder supports different set of methods. - If not provided, a default value is used. - - - ``"frame"``: Decode more than one frame at once. - Each thread handles one frame. - This will increase decoding delay by one frame per thread - - ``"slice"``: Decode more than one part of a single frame at once. - - | - """ - - -_hw_accel = """Enable hardware acceleration. - - When video is decoded on CUDA hardware, for example - `decoder="h264_cuvid"`, passing CUDA device indicator to `hw_accel` - (i.e. `hw_accel="cuda:0"`) will make StreamingMediaDecoder place the resulting - frames directly on the specified CUDA device as CUDA tensor. - - If `None`, the frame will be moved to CPU memory. 
- Default: ``None``.""" - - -_format_audio_args = _format_doc( - frames_per_chunk=_frames_per_chunk, - buffer_chunk_size=_buffer_chunk_size, - stream_index=_audio_stream_index, - decoder=_decoder, - decoder_option=_decoder_option, -) - - -_format_video_args = _format_doc( - frames_per_chunk=_frames_per_chunk, - buffer_chunk_size=_buffer_chunk_size, - stream_index=_video_stream_index, - decoder=_decoder, - decoder_option=_decoder_option, - hw_accel=_hw_accel, -) - - -InputStreamTypes = TypeVar("InputStream", bound=SourceStream) -OutputStreamTypes = TypeVar("OutputStream", bound=OutputStream) - -class StreamingMediaDecoder: - """Fetch and decode audio/video streams chunk by chunk. - - For the detailed usage of this class, please refer to the tutorial. - - Args: - src (str, path-like, bytes or file-like object): The media source. - If string-type, it must be a resource indicator that FFmpeg can - handle. This includes a file path, URL, device identifier or - filter expression. The supported value depends on the FFmpeg found - in the system. - - If bytes, it must be an encoded media data in contiguous memory. - - If file-like object, it must support `read` method with the signature - `read(size: int) -> bytes`. - Additionally, if the file-like object has `seek` method, it uses - the method when parsing media metadata. This improves the reliability - of codec detection. The signagure of `seek` method must be - `seek(offset: int, whence: int) -> int`. - - Please refer to the following for the expected signature and behavior - of `read` and `seek` method. - - - https://docs.python.org/3/library/io.html#io.BufferedIOBase.read - - https://docs.python.org/3/library/io.html#io.IOBase.seek - - format (str or None, optional): - Override the input format, or specify the source sound device. - Default: ``None`` (no override nor device input). - - This argument serves two different usecases. - - 1) Override the source format. 
- This is useful when the input data do not contain a header. - - 2) Specify the input source device. - This allows to load media stream from hardware devices, - such as microphone, camera and screen, or a virtual device. - - - .. note:: - - This option roughly corresponds to ``-f`` option of ``ffmpeg`` command. - Please refer to the ffmpeg documentations for the possible values. - - https://ffmpeg.org/ffmpeg-formats.html#Demuxers - - Please use :py:func:`~torio.utils.ffmpeg_utils.get_demuxers` to list the - demultiplexers available in the current environment. - - For device access, the available values vary based on hardware (AV device) and - software configuration (ffmpeg build). - - https://ffmpeg.org/ffmpeg-devices.html#Input-Devices - - Please use :py:func:`~torio.utils.ffmpeg_utils.get_input_devices` to list - the input devices available in the current environment. - - option (dict of str to str, optional): - Custom option passed when initializing format context (opening source). - - You can use this argument to change the input source before it is passed to decoder. - - Default: ``None``. - - buffer_size (int): - The internal buffer size in byte. Used only when `src` is file-like object. - - Default: `4096`. 
- """ - - def __init__( - self, - src: Union[str, Path, BinaryIO], - format: Optional[str] = None, - option: Optional[Dict[str, str]] = None, - buffer_size: int = 4096, - ): - self.src = src - if isinstance(src, bytes): - self._be = ffmpeg_ext.StreamingMediaDecoderBytes(src, format, option, buffer_size) - elif hasattr(src, "read"): - self._be = ffmpeg_ext.StreamingMediaDecoderFileObj(src, format, option, buffer_size) - else: - self._be = ffmpeg_ext.StreamingMediaDecoder(os.path.normpath(src), format, option) - - i = self._be.find_best_audio_stream() - self._default_audio_stream = None if i < 0 else i - i = self._be.find_best_video_stream() - self._default_video_stream = None if i < 0 else i - - @property - def num_src_streams(self): - """Number of streams found in the provided media source. - - :type: int - """ - return self._be.num_src_streams() - - @property - def num_out_streams(self): - """Number of output streams configured by client code. - - :type: int - """ - return self._be.num_out_streams() - - @property - def default_audio_stream(self): - """The index of default audio stream. ``None`` if there is no audio stream - - :type: Optional[int] - """ - return self._default_audio_stream - - @property - def default_video_stream(self): - """The index of default video stream. ``None`` if there is no video stream - - :type: Optional[int] - """ - return self._default_video_stream - - def get_metadata(self) -> Dict[str, str]: - """Get the metadata of the source media. - - Returns: - dict - """ - return self._be.get_metadata() - - def get_src_stream_info(self, i: int) -> InputStreamTypes: - """Get the metadata of source stream - - Args: - i (int): Stream index. - Returns: - InputStreamTypes: - Information about the source stream. - If the source stream is audio type, then - :class:`~torio.io._stream_reader.SourceAudioStream` is returned. - If it is video type, then - :class:`~torio.io._stream_reader.SourceVideoStream` is returned. 
- Otherwise :class:`~torio.io._stream_reader.SourceStream` class is returned. - """ - return _parse_si(self._be.get_src_stream_info(i)) - - def get_out_stream_info(self, i: int) -> OutputStreamTypes: - """Get the metadata of output stream - - Args: - i (int): Stream index. - Returns: - OutputStreamTypes - Information about the output stream. - If the output stream is audio type, then - :class:`~torio.io._stream_reader.OutputAudioStream` is returned. - If it is video type, then - :class:`~torio.io._stream_reader.OutputVideoStream` is returned. - """ - info = self._be.get_out_stream_info(i) - return _parse_oi(info) - - def seek(self, timestamp: float, mode: str = "precise"): - """Seek the stream to the given timestamp [second] - - Args: - timestamp (float): Target time in second. - mode (str): Controls how seek is done. - Valid choices are; - - * "key": Seek into the nearest key frame before the given timestamp. - * "any": Seek into any frame (including non-key frames) before the given timestamp. - * "precise": First seek into the nearest key frame before the given timestamp, then - decode frames until it reaches the closes frame to the given timestamp. - - Note: - All the modes invalidate and reset the internal state of decoder. - When using "any" mode and if it ends up seeking into non-key frame, - the image decoded may be invalid due to lack of key frame. - Using "precise" will workaround this issue by decoding frames from previous - key frame, but will be slower. - """ - modes = { - "key": 0, - "any": 1, - "precise": 2, - } - if mode not in modes: - raise ValueError(f"The value of mode must be one of {list(modes.keys())}. 
Found: {mode}") - self._be.seek(timestamp, modes[mode]) - - @_format_audio_args - def add_basic_audio_stream( - self, - frames_per_chunk: int, - buffer_chunk_size: int = 3, - *, - stream_index: Optional[int] = None, - decoder: Optional[str] = None, - decoder_option: Optional[Dict[str, str]] = None, - format: Optional[str] = "fltp", - sample_rate: Optional[int] = None, - num_channels: Optional[int] = None, - ): - """Add output audio stream - - Args: - frames_per_chunk (int): {frames_per_chunk} - - buffer_chunk_size (int, optional): {buffer_chunk_size} - - stream_index (int or None, optional): {stream_index} - - decoder (str or None, optional): {decoder} - - decoder_option (dict or None, optional): {decoder_option} - - format (str, optional): Output sample format (precision). - - If ``None``, the output chunk has dtype corresponding to - the precision of the source audio. - - Otherwise, the sample is converted and the output dtype is changed - as following. - - - ``"u8p"``: The output is ``torch.uint8`` type. - - ``"s16p"``: The output is ``torch.int16`` type. - - ``"s32p"``: The output is ``torch.int32`` type. - - ``"s64p"``: The output is ``torch.int64`` type. - - ``"fltp"``: The output is ``torch.float32`` type. - - ``"dblp"``: The output is ``torch.float64`` type. - - Default: ``"fltp"``. - - sample_rate (int or None, optional): If provided, resample the audio. - - num_channels (int, or None, optional): If provided, change the number of channels. 
- """ - self.add_audio_stream( - frames_per_chunk, - buffer_chunk_size, - stream_index=stream_index, - decoder=decoder, - decoder_option=decoder_option, - filter_desc=_get_afilter_desc(sample_rate, format, num_channels), - ) - - @_format_video_args - def add_basic_video_stream( - self, - frames_per_chunk: int, - buffer_chunk_size: int = 3, - *, - stream_index: Optional[int] = None, - decoder: Optional[str] = None, - decoder_option: Optional[Dict[str, str]] = None, - format: Optional[str] = "rgb24", - frame_rate: Optional[int] = None, - width: Optional[int] = None, - height: Optional[int] = None, - hw_accel: Optional[str] = None, - ): - """Add output video stream - - Args: - frames_per_chunk (int): {frames_per_chunk} - - buffer_chunk_size (int, optional): {buffer_chunk_size} - - stream_index (int or None, optional): {stream_index} - - decoder (str or None, optional): {decoder} - - decoder_option (dict or None, optional): {decoder_option} - - format (str, optional): Change the format of image channels. Valid values are, - - - ``"rgb24"``: 8 bits * 3 channels (R, G, B) - - ``"bgr24"``: 8 bits * 3 channels (B, G, R) - - ``"yuv420p"``: 8 bits * 3 channels (Y, U, V) - - ``"gray"``: 8 bits * 1 channels - - Default: ``"rgb24"``. - - frame_rate (int or None, optional): If provided, change the frame rate. - - width (int or None, optional): If provided, change the image width. Unit: Pixel. - - height (int or None, optional): If provided, change the image height. Unit: Pixel. 
- - hw_accel (str or None, optional): {hw_accel} - """ - self.add_video_stream( - frames_per_chunk, - buffer_chunk_size, - stream_index=stream_index, - decoder=decoder, - decoder_option=decoder_option, - filter_desc=_get_vfilter_desc(frame_rate, width, height, format), - hw_accel=hw_accel, - ) - - @_format_audio_args - def add_audio_stream( - self, - frames_per_chunk: int, - buffer_chunk_size: int = 3, - *, - stream_index: Optional[int] = None, - decoder: Optional[str] = None, - decoder_option: Optional[Dict[str, str]] = None, - filter_desc: Optional[str] = None, - ): - """Add output audio stream - - Args: - frames_per_chunk (int): {frames_per_chunk} - - buffer_chunk_size (int, optional): {buffer_chunk_size} - - stream_index (int or None, optional): {stream_index} - - decoder (str or None, optional): {decoder} - - decoder_option (dict or None, optional): {decoder_option} - - filter_desc (str or None, optional): Filter description. - The list of available filters can be found at - https://ffmpeg.org/ffmpeg-filters.html - Note that complex filters are not supported. 
- - """ - i = self.default_audio_stream if stream_index is None else stream_index - if i is None: - raise RuntimeError("There is no audio stream.") - self._be.add_audio_stream( - i, - frames_per_chunk, - buffer_chunk_size, - filter_desc, - decoder, - decoder_option or {}, - ) - - @_format_video_args - def add_video_stream( - self, - frames_per_chunk: int, - buffer_chunk_size: int = 3, - *, - stream_index: Optional[int] = None, - decoder: Optional[str] = None, - decoder_option: Optional[Dict[str, str]] = None, - filter_desc: Optional[str] = None, - hw_accel: Optional[str] = None, - ): - """Add output video stream - - Args: - frames_per_chunk (int): {frames_per_chunk} - - buffer_chunk_size (int, optional): {buffer_chunk_size} - - stream_index (int or None, optional): {stream_index} - - decoder (str or None, optional): {decoder} - - decoder_option (dict or None, optional): {decoder_option} - - hw_accel (str or None, optional): {hw_accel} - - filter_desc (str or None, optional): Filter description. - The list of available filters can be found at - https://ffmpeg.org/ffmpeg-filters.html - Note that complex filters are not supported. - """ - i = self.default_video_stream if stream_index is None else stream_index - if i is None: - raise RuntimeError("There is no video stream.") - self._be.add_video_stream( - i, - frames_per_chunk, - buffer_chunk_size, - filter_desc, - decoder, - decoder_option or {}, - hw_accel, - ) - - def remove_stream(self, i: int): - """Remove an output stream. - - Args: - i (int): Index of the output stream to be removed. - """ - self._be.remove_stream(i) - - def process_packet(self, timeout: Optional[float] = None, backoff: float = 10.0) -> int: - """Read the source media and process one packet. - - If a packet is read successfully, then the data in the packet will - be decoded and passed to corresponding output stream processors. - - If the packet belongs to a source stream that is not connected to - an output stream, then the data are discarded. 
- - When the source reaches EOF, then it triggers all the output stream - processors to enter drain mode. All the output stream processors - flush the pending frames. - - Args: - timeout (float or None, optional): Timeout in milli seconds. - - This argument changes the retry behavior when it failed to - process a packet due to the underlying media resource being - temporarily unavailable. - - When using a media device such as a microphone, there are cases - where the underlying buffer is not ready. - Calling this function in such case would cause the system to report - `EAGAIN (resource temporarily unavailable)`. - - * ``>=0``: Keep retrying until the given time passes. - - * ``0<``: Keep retrying forever. - - * ``None`` : No retrying and raise an exception immediately. - - Default: ``None``. - - Note: - - The retry behavior is applicable only when the reason is the - unavailable resource. It is not invoked if the reason of failure is - other. - - backoff (float, optional): Time to wait before retrying in milli seconds. - - This option is effective only when `timeout` is effective. (not ``None``) - - When `timeout` is effective, this `backoff` controls how long the function - should wait before retrying. Default: ``10.0``. - - Returns: - int: - ``0`` - A packet was processed properly. The caller can keep - calling this function to buffer more frames. - - ``1`` - The streamer reached EOF. All the output stream processors - flushed the pending frames. The caller should stop calling - this method. - """ - return self._be.process_packet(timeout, backoff) - - def process_all_packets(self): - """Process packets until it reaches EOF.""" - self._be.process_all_packets() - - def is_buffer_ready(self) -> bool: - """Returns true if all the output streams have at least one chunk filled.""" - return self._be.is_buffer_ready() - - def pop_chunks(self) -> Tuple[Optional[ChunkTensor]]: - """Pop one chunk from all the output stream buffers. 
- - Returns: - Tuple[Optional[ChunkTensor]]: - Buffer contents. - If a buffer does not contain any frame, then `None` is returned instead. - """ - ret = [] - for chunk in self._be.pop_chunks(): - if chunk is None: - ret.append(None) - else: - ret.append(ChunkTensor(chunk.frames, chunk.pts)) - return ret - - def fill_buffer(self, timeout: Optional[float] = None, backoff: float = 10.0) -> int: - """Keep processing packets until all buffers have at least one chunk - - Arguments: - timeout (float or None, optional): See - :py:func:`~StreamingMediaDecoder.process_packet`. (Default: ``None``) - - backoff (float, optional): See - :py:func:`~StreamingMediaDecoder.process_packet`. (Default: ``10.0``) - - Returns: - int: - ``0`` - Packets are processed properly and buffers are - ready to be popped once. - - ``1`` - The streamer reached EOF. All the output stream processors - flushed the pending frames. The caller should stop calling - this method. - """ - return self._be.fill_buffer(timeout, backoff) - - def stream( - self, timeout: Optional[float] = None, backoff: float = 10.0 - ) -> Iterator[Tuple[Optional[ChunkTensor], ...]]: - """Return an iterator that generates output tensors - - Arguments: - timeout (float or None, optional): See - :py:func:`~StreamingMediaDecoder.process_packet`. (Default: ``None``) - - backoff (float, optional): See - :py:func:`~StreamingMediaDecoder.process_packet`. (Default: ``10.0``) - - Returns: - Iterator[Tuple[Optional[ChunkTensor], ...]]: - Iterator that yields a tuple of chunks that correspond to the output - streams defined by client code. - If an output stream is exhausted, then the chunk Tensor is substituted - with ``None``. - The iterator stops if all the output streams are exhausted. 
- """ - if self.num_out_streams == 0: - raise RuntimeError("No output stream is configured.") - - while True: - if self.fill_buffer(timeout, backoff): - break - yield self.pop_chunks() - - while True: - chunks = self.pop_chunks() - if all(c is None for c in chunks): - return - yield chunks diff --git a/src/torio/io/_streaming_media_encoder.py b/src/torio/io/_streaming_media_encoder.py deleted file mode 100644 index bfbfe8791b..0000000000 --- a/src/torio/io/_streaming_media_encoder.py +++ /dev/null @@ -1,502 +0,0 @@ -from dataclasses import dataclass -from pathlib import Path -from typing import BinaryIO, Dict, Optional, Union - -import torch -import torio - -ffmpeg_ext = torio._extension.lazy_import_ffmpeg_ext() - - -@dataclass -class CodecConfig: - """Codec configuration.""" - - bit_rate: int = -1 - """Bit rate""" - - compression_level: int = -1 - """Compression level""" - - qscale: Optional[int] = None - """Global quality factor. Enables variable bit rate. Valid values depend on encoder. - - For example: MP3 takes ``0`` - ``9`` (https://trac.ffmpeg.org/wiki/Encode/MP3) while - libvorbis takes ``-1`` - ``10``. - """ - - gop_size: int = -1 - """The number of pictures in a group of pictures, or 0 for intra_only""" - - max_b_frames: int = -1 - """maximum number of B-frames between non-B-frames.""" - - -def _convert_config(cfg: CodecConfig): - if cfg is None: - return None - # Convert the codecconfig to C++ compatible type. - # omitting the return type annotation so as not to access ffmpeg_ext here. - return ffmpeg_ext.CodecConfig( - cfg.bit_rate, - cfg.compression_level, - cfg.qscale, - cfg.gop_size, - cfg.max_b_frames, - ) - - -def _format_doc(**kwargs): - def decorator(obj): - obj.__doc__ = obj.__doc__.format(**kwargs) - return obj - - return decorator - - -_encoder = """The name of the encoder to be used. - When provided, use the specified encoder instead of the default one. 
- - To list the available encoders, please use - :py:func:`~torio.utils.ffmpeg_utils.get_audio_encoders` for audio, and - :py:func:`~torio.utils.ffmpeg_utils.get_video_encoders` for video. - - Default: ``None``.""" - - -_encoder_option = """Options passed to encoder. - Mapping from str to str. - - To list encoder options for a encoder, you can use - ``ffmpeg -h encoder=`` command. - - Default: ``None``. - - | - - In addition to encoder-specific options, you can also pass options related - to multithreading. They are effective only if the encoder support them. - If neither of them are provided, StreamReader defaults to single thread. - - ``"threads"``: The number of threads (in str). - Providing the value ``"0"`` will let FFmpeg decides based on its heuristics. - - ``"thread_type"``: Which multithreading method to use. - The valid values are ``"frame"`` or ``"slice"``. - Note that each encoder supports different set of methods. - If not provided, a default value is used. - - - ``"frame"``: Encode more than one frame at once. - Each thread handles one frame. - This will increase decoding delay by one frame per thread - - ``"slice"``: Encode more than one part of a single frame at once. - - | - """ - - -_encoder_format = """Format used to encode media. - When encoder supports multiple formats, passing this argument will override - the format used for encoding. - - To list supported formats for the encoder, you can use - ``ffmpeg -h encoder=`` command. - - Default: ``None``. - - Note: - When ``encoder_format`` option is not provided, encoder uses its default format. - - For example, when encoding audio into wav format, 16-bit signed integer is used, - and when encoding video into mp4 format (h264 encoder), one of YUV format is used. - - This is because typically, 32-bit or 16-bit floating point is used in audio models but - they are not commonly used in audio formats. 
Similarly, RGB24 is commonly used in vision - models, but video formats usually (and better) support YUV formats. - """ - -_codec_config = """Codec configuration. Please refer to :py:class:`CodecConfig` for - configuration options. - - Default: ``None``.""" - - -_filter_desc = """Additional processing to apply before encoding the input media. - """ - -_format_common_args = _format_doc( - encoder=_encoder, - encoder_option=_encoder_option, - encoder_format=_encoder_format, - codec_config=_codec_config, - filter_desc=_filter_desc, -) - - -class StreamingMediaEncoder: - """Encode and write audio/video streams chunk by chunk - - Args: - dst (str, path-like or file-like object): The destination where the encoded data are written. - If string-type, it must be a resource indicator that FFmpeg can - handle. The supported value depends on the FFmpeg found in the system. - - If file-like object, it must support `write` method with the signature - `write(data: bytes) -> int`. - - Please refer to the following for the expected signature and behavior of - `write` method. - - - https://docs.python.org/3/library/io.html#io.BufferedIOBase.write - - format (str or None, optional): - Override the output format, or specify the output media device. - Default: ``None`` (no override nor device output). - - This argument serves two different use cases. - - 1) Override the output format. - This is useful when writing raw data or in a format different from the extension. - - 2) Specify the output device. - This allows to output media streams to hardware devices, - such as speaker and video screen. - - .. note:: - - This option roughly corresponds to ``-f`` option of ``ffmpeg`` command. - Please refer to the ffmpeg documentations for possible values. - - https://ffmpeg.org/ffmpeg-formats.html#Muxers - - Please use :py:func:`~torio.utils.ffmpeg_utils.get_muxers` to list the - multiplexers available in the current environment. 
- - For device access, the available values vary based on hardware (AV device) and - software configuration (ffmpeg build). - Please refer to the ffmpeg documentations for possible values. - - https://ffmpeg.org/ffmpeg-devices.html#Output-Devices - - Please use :py:func:`~torio.utils.ffmpeg_utils.get_output_devices` to list - the output devices available in the current environment. - - buffer_size (int): - The internal buffer size in byte. Used only when `dst` is a file-like object. - - Default: `4096`. - """ - - def __init__( - self, - dst: Union[str, Path, BinaryIO], - format: Optional[str] = None, - buffer_size: int = 4096, - ): - if hasattr(dst, "write"): - self._s = ffmpeg_ext.StreamingMediaEncoderFileObj(dst, format, buffer_size) - else: - self._s = ffmpeg_ext.StreamingMediaEncoder(str(dst), format) - self._is_open = False - - @_format_common_args - def add_audio_stream( - self, - sample_rate: int, - num_channels: int, - format: str = "flt", - *, - encoder: Optional[str] = None, - encoder_option: Optional[Dict[str, str]] = None, - encoder_sample_rate: Optional[int] = None, - encoder_num_channels: Optional[int] = None, - encoder_format: Optional[str] = None, - codec_config: Optional[CodecConfig] = None, - filter_desc: Optional[str] = None, - ): - """Add an output audio stream. - - Args: - sample_rate (int): The sample rate. - - num_channels (int): The number of channels. - - format (str, optional): Input sample format, which determines the dtype - of the input tensor. - - - ``"u8"``: The input tensor must be ``torch.uint8`` type. - - ``"s16"``: The input tensor must be ``torch.int16`` type. - - ``"s32"``: The input tensor must be ``torch.int32`` type. - - ``"s64"``: The input tensor must be ``torch.int64`` type. - - ``"flt"``: The input tensor must be ``torch.float32`` type. - - ``"dbl"``: The input tensor must be ``torch.float64`` type. - - Default: ``"flt"``. 
- - encoder (str or None, optional): {encoder} - - encoder_option (dict or None, optional): {encoder_option} - - encoder_sample_rate (int or None, optional): Override the sample rate used for encoding time. - Some encoders pose restriction on the sample rate used for encoding. - If the source sample rate is not supported by the encoder, the source sample rate is used, - otherwise a default one is picked. - - For example, ``"opus"`` encoder only supports 48k Hz, so, when encoding a - waveform with ``"opus"`` encoder, it is always encoded as 48k Hz. - Meanwhile ``"mp3"`` (``"libmp3lame"``) supports 44.1k, 48k, 32k, 22.05k, - 24k, 16k, 11.025k, 12k and 8k Hz. - If the original sample rate is one of these, then the original sample rate - is used, otherwise it will be resampled to a default one (44.1k). - When encoding into WAV format, there is no restriction on sample rate, - so the original sample rate will be used. - - Providing ``encoder_sample_rate`` will override this behavior and - make encoder attempt to use the provided sample rate. - The provided value must be one support by the encoder. - - encoder_num_channels (int or None, optional): Override the number of channels used for encoding. - - Similar to sample rate, some encoders (such as ``"opus"``, - ``"vorbis"`` and ``"g722"``) pose restriction on - the numbe of channels that can be used for encoding. - - If the original number of channels is supported by encoder, - then it will be used, otherwise, the encoder attempts to - remix the channel to one of the supported ones. - - Providing ``encoder_num_channels`` will override this behavior and - make encoder attempt to use the provided number of channels. - The provided value must be one support by the encoder. 
- - encoder_format (str or None, optional): {encoder_format} - - codec_config (CodecConfig or None, optional): {codec_config} - - filter_desc (str or None, optional): {filter_desc} - """ - self._s.add_audio_stream( - sample_rate, - num_channels, - format, - encoder, - encoder_option, - encoder_format, - encoder_sample_rate, - encoder_num_channels, - _convert_config(codec_config), - filter_desc, - ) - - @_format_common_args - def add_video_stream( - self, - frame_rate: float, - width: int, - height: int, - format: str = "rgb24", - *, - encoder: Optional[str] = None, - encoder_option: Optional[Dict[str, str]] = None, - encoder_frame_rate: Optional[float] = None, - encoder_width: Optional[int] = None, - encoder_height: Optional[int] = None, - encoder_format: Optional[str] = None, - codec_config: Optional[CodecConfig] = None, - filter_desc: Optional[str] = None, - hw_accel: Optional[str] = None, - ): - """Add an output video stream. - - This method has to be called before `open` is called. - - Args: - frame_rate (float): Frame rate of the video. - - width (int): Width of the video frame. - - height (int): Height of the video frame. - - format (str, optional): Input pixel format, which determines the - color channel order of the input tensor. - - - ``"gray8"``: One channel, grayscale. - - ``"rgb24"``: Three channels in the order of RGB. - - ``"bgr24"``: Three channels in the order of BGR. - - ``"yuv444p"``: Three channels in the order of YUV. - - Default: ``"rgb24"``. - - In either case, the input tensor has to be ``torch.uint8`` type and - the shape must be (frame, channel, height, width). - - encoder (str or None, optional): {encoder} - - encoder_option (dict or None, optional): {encoder_option} - - encoder_frame_rate (float or None, optional): Override the frame rate used for encoding. - - Some encoders, (such as ``"mpeg1"`` and ``"mpeg2"``) pose restriction on the - frame rate that can be used for encoding. 
- If such case, if the source frame rate (provided as ``frame_rate``) is not - one of the supported frame rate, then a default one is picked, and the frame rate - is changed on-the-fly. Otherwise the source frame rate is used. - - Providing ``encoder_frame_rate`` will override this behavior and - make encoder attempts to use the provided sample rate. - The provided value must be one support by the encoder. - - encoder_width (int or None, optional): Width of the image used for encoding. - This allows to change the image size during encoding. - - encoder_height (int or None, optional): Height of the image used for encoding. - This allows to change the image size during encoding. - - encoder_format (str or None, optional): {encoder_format} - - codec_config (CodecConfig or None, optional): {codec_config} - - filter_desc (str or None, optional): {filter_desc} - - hw_accel (str or None, optional): Enable hardware acceleration. - - When video is encoded on CUDA hardware, for example - `encoder="h264_nvenc"`, passing CUDA device indicator to `hw_accel` - (i.e. `hw_accel="cuda:0"`) will make StreamingMediaEncoder expect video - chunk to be CUDA Tensor. Passing CPU Tensor will result in an error. - - If `None`, the video chunk Tensor has to be CPU Tensor. - Default: ``None``. - """ - self._s.add_video_stream( - frame_rate, - width, - height, - format, - encoder, - encoder_option, - encoder_format, - encoder_frame_rate, - encoder_width, - encoder_height, - hw_accel, - _convert_config(codec_config), - filter_desc, - ) - - def set_metadata(self, metadata: Dict[str, str]): - """Set file-level metadata - - Args: - metadata (dict or None, optional): File-level metadata. 
- """ - self._s.set_metadata(metadata) - - def _print_output_stream(self, i: int): - """[debug] Print the registered stream information to stdout.""" - self._s.dump_format(i) - - def open(self, option: Optional[Dict[str, str]] = None) -> "StreamingMediaEncoder": - """Open the output file / device and write the header. - - :py:class:`StreamingMediaEncoder` is also a context manager and therefore supports the - ``with`` statement. - This method returns the instance on which the method is called (i.e. `self`), - so that it can be used in `with` statement. - It is recommended to use context manager, as the file is closed automatically - when exiting from ``with`` clause. - - Args: - option (dict or None, optional): Private options for protocol, device and muxer. See example. - - Example - Protocol option - >>> s = StreamingMediaEncoder(dst="rtmp://localhost:1234/live/app", format="flv") - >>> s.add_video_stream(...) - >>> # Passing protocol option `listen=1` makes StreamingMediaEncoder act as RTMP server. - >>> with s.open(option={"listen": "1"}) as f: - >>> f.write_video_chunk(...) - - Example - Device option - >>> s = StreamingMediaEncoder("-", format="sdl") - >>> s.add_video_stream(..., encoder_format="rgb24") - >>> # Open SDL video player with fullscreen - >>> with s.open(option={"window_fullscreen": "1"}): - >>> f.write_video_chunk(...) - - Example - Muxer option - >>> s = StreamingMediaEncoder("foo.flac") - >>> s.add_audio_stream(...) - >>> s.set_metadata({"artist": "torio contributors"}) - >>> # FLAC muxer has a private option to not write the header. - >>> # The resulting file does not contain the above metadata. - >>> with s.open(option={"write_header": "false"}) as f: - >>> f.write_audio_chunk(...) - """ - if not self._is_open: - self._s.open(option) - self._is_open = True - return self - - def close(self): - """Close the output - - :py:class:`StreamingMediaEncoder` is also a context manager and therefore supports the - ``with`` statement. 
- It is recommended to use context manager, as the file is closed automatically - when exiting from ``with`` clause. - - See :py:meth:`StreamingMediaEncoder.open` for more detail. - """ - if self._is_open: - self._s.close() - self._is_open = False - - def write_audio_chunk(self, i: int, chunk: torch.Tensor, pts: Optional[float] = None): - """Write audio data - - Args: - i (int): Stream index. - chunk (Tensor): Waveform tensor. Shape: `(frame, channel)`. - The ``dtype`` must match what was passed to :py:meth:`add_audio_stream` method. - pts (float, optional, or None): If provided, overwrite the presentation timestamp. - - .. note:: - - The provided value is converted to integer value expressed in basis of - sample rate. Therefore, it is truncated to the nearest value of - ``n / sample_rate``. - """ - self._s.write_audio_chunk(i, chunk, pts) - - def write_video_chunk(self, i: int, chunk: torch.Tensor, pts: Optional[float] = None): - """Write video/image data - - Args: - i (int): Stream index. - chunk (Tensor): Video/image tensor. - Shape: `(time, channel, height, width)`. - The ``dtype`` must be ``torch.uint8``. - The shape (height, width and the number of channels) must match - what was configured when calling :py:meth:`add_video_stream` - pts (float, optional or None): If provided, overwrite the presentation timestamp. - - .. note:: - - The provided value is converted to integer value expressed in basis of - frame rate. Therefore, it is truncated to the nearest value of - ``n / frame_rate``. 
- """ - self._s.write_video_chunk(i, chunk, pts) - - def flush(self): - """Flush the frames from encoders and write the frames to the destination.""" - self._s.flush() - - def __enter__(self): - """Context manager so that the destination is closed and data are flushed automatically.""" - return self - - def __exit__(self, exception_type, exception_value, traceback): - """Context manager so that the destination is closed and data are flushed automatically.""" - self.flush() - self.close() diff --git a/src/torio/lib/__init__.py b/src/torio/lib/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/torio/utils/__init__.py b/src/torio/utils/__init__.py deleted file mode 100644 index a3dbc29a6a..0000000000 --- a/src/torio/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from . import ffmpeg_utils - - -__all__ = ["ffmpeg_utils"] diff --git a/src/torio/utils/ffmpeg_utils.py b/src/torio/utils/ffmpeg_utils.py deleted file mode 100644 index a3f2232804..0000000000 --- a/src/torio/utils/ffmpeg_utils.py +++ /dev/null @@ -1,275 +0,0 @@ -"""Module to change the configuration of FFmpeg libraries (such as libavformat). - -It affects functionalities in :py:mod:`torio.io`. - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - Some APIs are deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. -""" -from typing import Dict, List, Tuple - -import torio - -ffmpeg_ext = torio._extension.lazy_import_ffmpeg_ext() - - -from torchaudio._internal.module_utils import dropping_support - - -@dropping_support -def get_versions() -> Dict[str, Tuple[int]]: - """Get the versions of FFmpeg libraries - - Returns: - dict: mapping from library names to version string, - i.e. 
`"libavutil": (56, 22, 100)`. - """ - return ffmpeg_ext.get_versions() - - -@dropping_support -def get_log_level() -> int: - """Get the log level of FFmpeg. - - See :py:func:`set_log_level` for the detail. - """ - return ffmpeg_ext.get_log_level() - - -@dropping_support -def set_log_level(level: int): - """Set the log level of FFmpeg (libavformat etc) - - Arguments: - level (int): Log level. The larger, the more verbose. - - The following values are common values, the corresponding ``ffmpeg``'s - ``-loglevel`` option value and desription. - - * ``-8`` (``quiet``): - Print no output. - * ``0`` (``panic``): - Something went really wrong and we will crash now. - * ``8`` (``fatal``): - Something went wrong and recovery is not possible. - For example, no header was found for a format which depends - on headers or an illegal combination of parameters is used. - * ``16`` (``error``): - Something went wrong and cannot losslessly be recovered. - However, not all future data is affected. - * ``24`` (``warning``): - Something somehow does not look correct. - This may or may not lead to problems. - * ``32`` (``info``): - Standard information. - * ``40`` (``verbose``): - Detailed information. - * ``48`` (``debug``): - Stuff which is only useful for libav* developers. - * ``56`` (``trace``): - Extremely verbose debugging, useful for libav* development. - - """ - ffmpeg_ext.set_log_level(level) - - -@dropping_support -def get_demuxers() -> Dict[str, str]: - """Get the available demuxers. - - Returns: - Dict[str, str]: Mapping from demuxer (format) short name to long name. - - Example - >>> for k, v in get_demuxers().items(): - >>> print(f"{k}: {v}") - ... aa: Audible AA format files - ... aac: raw ADTS AAC (Advanced Audio Coding) - ... aax: CRI AAX - ... ac3: raw AC-3 - """ - return ffmpeg_ext.get_demuxers() - - -@dropping_support -def get_muxers() -> Dict[str, str]: - """Get the available muxers. - - Returns: - Dict[str, str]: Mapping from muxer (format) short name to long name. 
- - Example - >>> for k, v in get_muxers().items(): - >>> print(f"{k}: {v}") - ... a64: a64 - video for Commodore 64 - ... ac3: raw AC-3 - ... adts: ADTS AAC (Advanced Audio Coding) - ... adx: CRI ADX - ... aiff: Audio IFF - """ - return ffmpeg_ext.get_muxers() - - -@dropping_support -def get_audio_decoders() -> Dict[str, str]: - """Get the available audio decoders. - - Returns: - Dict[str, str]: Mapping from decoder short name to long name. - - Example - >>> for k, v in get_audio_decoders().items(): - >>> print(f"{k}: {v}") - ... a64: a64 - video for Commodore 64 - ... ac3: raw AC-3 - ... adts: ADTS AAC (Advanced Audio Coding) - ... adx: CRI ADX - ... aiff: Audio IFF - """ - return ffmpeg_ext.get_audio_decoders() - - -@dropping_support -def get_audio_encoders() -> Dict[str, str]: - """Get the available audio encoders. - - Returns: - Dict[str, str]: Mapping from encoder short name to long name. - - Example - >>> for k, v in get_audio_encoders().items(): - >>> print(f"{k}: {v}") - ... comfortnoise: RFC 3389 comfort noise generator - ... s302m: SMPTE 302M - ... aac: AAC (Advanced Audio Coding) - ... ac3: ATSC A/52A (AC-3) - ... ac3_fixed: ATSC A/52A (AC-3) - ... alac: ALAC (Apple Lossless Audio Codec) - """ - return ffmpeg_ext.get_audio_encoders() - - -@dropping_support -def get_video_decoders() -> Dict[str, str]: - """Get the available video decoders. - - Returns: - Dict[str, str]: Mapping from decoder short name to long name. - - Example - >>> for k, v in get_video_decoders().items(): - >>> print(f"{k}: {v}") - ... aasc: Autodesk RLE - ... aic: Apple Intermediate Codec - ... alias_pix: Alias/Wavefront PIX image - ... agm: Amuse Graphics Movie - ... amv: AMV Video - ... anm: Deluxe Paint Animation - """ - return ffmpeg_ext.get_video_decoders() - - -@dropping_support -def get_video_encoders() -> Dict[str, str]: - """Get the available video encoders. - - Returns: - Dict[str, str]: Mapping from encoder short name to long name. 
- - Example - >>> for k, v in get_audio_encoders().items(): - >>> print(f"{k}: {v}") - ... a64multi: Multicolor charset for Commodore 64 - ... a64multi5: Multicolor charset for Commodore 64, extended with 5th color (colram) - ... alias_pix: Alias/Wavefront PIX image - ... amv: AMV Video - ... apng: APNG (Animated Portable Network Graphics) image - ... asv1: ASUS V1 - ... asv2: ASUS V2 - """ - return ffmpeg_ext.get_video_encoders() - - -@dropping_support -def get_input_devices() -> Dict[str, str]: - """Get the available input devices. - - Returns: - Dict[str, str]: Mapping from device short name to long name. - - Example - >>> for k, v in get_input_devices().items(): - >>> print(f"{k}: {v}") - ... avfoundation: AVFoundation input device - ... lavfi: Libavfilter virtual input device - """ - return ffmpeg_ext.get_input_devices() - - -@dropping_support -def get_output_devices() -> Dict[str, str]: - """Get the available output devices. - - Returns: - Dict[str, str]: Mapping from device short name to long name. - - Example - >>> for k, v in get_output_devices().items(): - >>> print(f"{k}: {v}") - ... audiotoolbox: AudioToolbox output device - """ - return ffmpeg_ext.get_output_devices() - - -@dropping_support -def get_input_protocols() -> List[str]: - """Get the supported input protocols. - - Returns: - List[str]: The names of supported input protocols - - Example - >>> print(get_input_protocols()) - ... ['file', 'ftp', 'hls', 'http','https', 'pipe', 'rtmp', 'tcp', 'tls', 'udp', 'unix'] - """ - return ffmpeg_ext.get_input_protocols() - - -@dropping_support -def get_output_protocols() -> List[str]: - """Get the supported output protocols. - - Returns: - list of str: The names of supported output protocols - - Example - >>> print(get_output_protocols()) - ... 
['file', 'ftp', 'http', 'https', 'md5', 'pipe', 'prompeg', 'rtmp', 'tee', 'tcp', 'tls', 'udp', 'unix'] - """ - return ffmpeg_ext.get_output_protocols() - - -@dropping_support -def get_build_config() -> str: - """Get the FFmpeg build configuration - - Returns: - str: Build configuration string. - - Example - >>> print(get_build_config()) - --prefix=/Users/runner/miniforge3 --cc=arm64-apple-darwin20.0.0-clang --enable-gpl --enable-hardcoded-tables --enable-libfreetype --enable-libopenh264 --enable-neon --enable-libx264 --enable-libx265 --enable-libaom --enable-libsvtav1 --enable-libxml2 --enable-libvpx --enable-pic --enable-pthreads --enable-shared --disable-static --enable-version3 --enable-zlib --enable-libmp3lame --pkg-config=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/pkg-config --enable-cross-compile --arch=arm64 --target-os=darwin --cross-prefix=arm64-apple-darwin20.0.0- --host-cc=/Users/runner/miniforge3/conda-bld/ffmpeg_1646229390493/_build_env/bin/x86_64-apple-darwin13.4.0-clang # noqa - """ - return ffmpeg_ext.get_build_config() - - -@dropping_support -def clear_cuda_context_cache(): - """Clear the CUDA context used by CUDA Hardware accelerated video decoding""" - ffmpeg_ext.clear_cuda_context_cache() diff --git a/tools/setup_helpers/extension.py b/tools/setup_helpers/extension.py index 58f5087854..6352e2cda1 100644 --- a/tools/setup_helpers/extension.py +++ b/tools/setup_helpers/extension.py @@ -65,26 +65,6 @@ def get_ext_modules(): Extension(name="torchaudio.lib.pybind11_prefixctc", sources=[]), ] ) - if _USE_FFMPEG: - if "FFMPEG_ROOT" in os.environ: - # single version ffmpeg mode - modules.extend( - [ - Extension(name="torio.lib.libtorio_ffmpeg", sources=[]), - Extension(name="torio.lib._torio_ffmpeg", sources=[]), - ] - ) - else: - modules.extend( - [ - Extension(name="torio.lib.libtorio_ffmpeg4", sources=[]), - Extension(name="torio.lib._torio_ffmpeg4", sources=[]), - Extension(name="torio.lib.libtorio_ffmpeg5", sources=[]), 
- Extension(name="torio.lib._torio_ffmpeg5", sources=[]), - Extension(name="torio.lib.libtorio_ffmpeg6", sources=[]), - Extension(name="torio.lib._torio_ffmpeg6", sources=[]), - ] - ) return modules From d2ccd8259f23abe43407d084a5b2580016d54abf Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Mon, 11 Aug 2025 22:39:32 +0000 Subject: [PATCH 03/35] Remove libtorio ffmpeg from cmakelists --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddc6dc15a2..a94c197a7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -177,7 +177,6 @@ if (USE_FFMPEG) message(STATUS "Building FFmpeg integration with multi version support") add_subdirectory(third_party/ffmpeg/multi) endif() - add_subdirectory(src/libtorio/ffmpeg) endif() if (BUILD_CUDA_CTC_DECODER) if (NOT USE_CUDA) From 7b47628092f52856ac960cd488b469f511aded5b Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Mon, 11 Aug 2025 23:08:06 +0000 Subject: [PATCH 04/35] Remove io directory --- docs/source/io.rst | 1 - src/torchaudio/io/__init__.py | 20 -- src/torchaudio/io/_effector.py | 347 --------------------------------- src/torchaudio/io/_playback.py | 72 ------- 4 files changed, 440 deletions(-) delete mode 100644 src/torchaudio/io/__init__.py delete mode 100644 src/torchaudio/io/_effector.py delete mode 100644 src/torchaudio/io/_playback.py diff --git a/docs/source/io.rst b/docs/source/io.rst index 202214cd8d..11e3c0c32c 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -22,7 +22,6 @@ torchaudio.io StreamReader StreamWriter - AudioEffector play_audio .. 
rubric:: Tutorials using ``torchaudio.io`` diff --git a/src/torchaudio/io/__init__.py b/src/torchaudio/io/__init__.py deleted file mode 100644 index caf35c63f8..0000000000 --- a/src/torchaudio/io/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from torio.io import CodecConfig as _CodecConfig, StreamingMediaDecoder as _StreamReader, StreamingMediaEncoder as _StreamWriter -from torchaudio._internal.module_utils import dropping_class_io_support, dropping_class_support, dropping_io_support - -from ._effector import AudioEffector as _AudioEffector -from ._playback import play_audio as _play_audio - -CodecConfig = dropping_class_io_support(_CodecConfig) -StreamReader = dropping_class_io_support(_StreamReader) -StreamWriter = dropping_class_io_support(_StreamWriter) -AudioEffector = dropping_class_support(_AudioEffector) -play_audio = dropping_io_support(_play_audio) - - -__all__ = [ - "AudioEffector", - "StreamReader", - "StreamWriter", - "CodecConfig", - "play_audio", -] diff --git a/src/torchaudio/io/_effector.py b/src/torchaudio/io/_effector.py deleted file mode 100644 index 74255684c8..0000000000 --- a/src/torchaudio/io/_effector.py +++ /dev/null @@ -1,347 +0,0 @@ -import io -from typing import Iterator, List, Optional - -import torch -from torch import Tensor - -from torio.io._streaming_media_decoder import _get_afilter_desc, StreamingMediaDecoder as StreamReader -from torio.io._streaming_media_encoder import CodecConfig, StreamingMediaEncoder as StreamWriter - - -class _StreamingIOBuffer: - """Streaming Bytes IO buffer. Data are dropped when read.""" - - def __init__(self): - self._buffer: List(bytes) = [] - - def write(self, b: bytes): - if b: - self._buffer.append(b) - return len(b) - - def pop(self, n): - """Pop the oldest byte string. 
It does not necessary return the requested amount""" - if not self._buffer: - return b"" - if len(self._buffer[0]) <= n: - return self._buffer.pop(0) - ret = self._buffer[0][:n] - self._buffer[0] = self._buffer[0][n:] - return ret - - -def _get_sample_fmt(dtype: torch.dtype): - types = { - torch.uint8: "u8", - torch.int16: "s16", - torch.int32: "s32", - torch.float32: "flt", - torch.float64: "dbl", - } - if dtype not in types: - raise ValueError(f"Unsupported dtype is provided {dtype}. Supported dtypes are: {types.keys()}") - return types[dtype] - - -class _AudioStreamingEncoder: - """Given a waveform, encode on-demand and return bytes""" - - def __init__( - self, - src: Tensor, - sample_rate: int, - effect: str, - muxer: str, - encoder: Optional[str], - codec_config: Optional[CodecConfig], - frames_per_chunk: int, - ): - self.src = src - self.buffer = _StreamingIOBuffer() - self.writer = StreamWriter(self.buffer, format=muxer) - self.writer.add_audio_stream( - num_channels=src.size(1), - sample_rate=sample_rate, - format=_get_sample_fmt(src.dtype), - encoder=encoder, - filter_desc=effect, - codec_config=codec_config, - ) - self.writer.open() - self.fpc = frames_per_chunk - - # index on the input tensor (along time-axis) - # we use -1 to indicate that we finished iterating the tensor and - # the writer is closed. 
- self.i_iter = 0 - - def read(self, n): - while not self.buffer._buffer and self.i_iter >= 0: - self.writer.write_audio_chunk(0, self.src[self.i_iter : self.i_iter + self.fpc]) - self.i_iter += self.fpc - if self.i_iter >= self.src.size(0): - self.writer.flush() - self.writer.close() - self.i_iter = -1 - return self.buffer.pop(n) - - -def _encode( - src: Tensor, - sample_rate: int, - effect: str, - muxer: str, - encoder: Optional[str], - codec_config: Optional[CodecConfig], -): - buffer = io.BytesIO() - writer = StreamWriter(buffer, format=muxer) - writer.add_audio_stream( - num_channels=src.size(1), - sample_rate=sample_rate, - format=_get_sample_fmt(src.dtype), - encoder=encoder, - filter_desc=effect, - codec_config=codec_config, - ) - with writer.open(): - writer.write_audio_chunk(0, src) - buffer.seek(0) - return buffer - - -def _get_muxer(dtype: torch.dtype): - # TODO: check if this works in Windows. - types = { - torch.uint8: "u8", - torch.int16: "s16le", - torch.int32: "s32le", - torch.float32: "f32le", - torch.float64: "f64le", - } - if dtype not in types: - raise ValueError(f"Unsupported dtype is provided {dtype}. Supported dtypes are: {types.keys()}") - return types[dtype] - - -class AudioEffector: - """Apply various filters and/or codecs to waveforms. - - .. versionadded:: 2.1 - - Args: - effect (str or None, optional): Filter expressions or ``None`` to apply no filter. - See https://ffmpeg.org/ffmpeg-filters.html#Audio-Filters for the - details of filter syntax. - - format (str or None, optional): When provided, encode the audio into the - corresponding format. Default: ``None``. - - encoder (str or None, optional): When provided, override the encoder used - by the ``format``. Default: ``None``. - - codec_config (CodecConfig or None, optional): When provided, configure the encoding codec. - Should be provided in conjunction with ``format`` option. 
- - pad_end (bool, optional): When enabled, and if the waveform becomes shorter after applying - effects/codec, then pad the end with silence. - - Example - Basic usage - To use ``AudioEffector``, first instantiate it with a set of - ``effect`` and ``format``. - - >>> # instantiate the effector - >>> effector = AudioEffector(effect=..., format=...) - - Then, use :py:meth:`~AudioEffector.apply` or :py:meth:`~AudioEffector.stream` - method to apply them. - - >>> # Apply the effect to the whole waveform - >>> applied = effector.apply(waveform, sample_rate) - - >>> # Apply the effect chunk-by-chunk - >>> for chunk in effector.stream(waveform, sample_rate): - >>> ... - - Example - Applying effects - Please refer to - https://ffmpeg.org/ffmpeg-filters.html#Filtergraph-description - for the overview of filter description, and - https://ffmpeg.org/ffmpeg-filters.html#toc-Audio-Filters - for the list of available filters. - - Tempo - https://ffmpeg.org/ffmpeg-filters.html#atempo - - >>> AudioEffector(effect="atempo=1.5") - - Echo - https://ffmpeg.org/ffmpeg-filters.html#aecho - - >>> AudioEffector(effect="aecho=0.8:0.88:60:0.4") - - Flanger - https://ffmpeg.org/ffmpeg-filters.html#flanger - - >>> AudioEffector(effect="aflanger") - - Vibrato - https://ffmpeg.org/ffmpeg-filters.html#vibrato - - >>> AudioEffector(effect="vibrato") - - Tremolo - https://ffmpeg.org/ffmpeg-filters.html#tremolo - - >>> AudioEffector(effect="vibrato") - - You can also apply multiple effects at once. - - >>> AudioEffector(effect="") - - Example - Applying codec - One can apply codec using ``format`` argument. ``format`` can be - audio format or container format. If the container format supports - multiple encoders, you can specify it with ``encoder`` argument. 
- - Wav format - (no compression is applied but samples are converted to - 16-bit signed integer) - - >>> AudioEffector(format="wav") - - Ogg format with default encoder - - >>> AudioEffector(format="ogg") - - Ogg format with vorbis - - >>> AudioEffector(format="ogg", encoder="vorbis") - - Ogg format with opus - - >>> AudioEffector(format="ogg", encoder="opus") - - Webm format with opus - - >>> AudioEffector(format="webm", encoder="opus") - - Example - Applying codec with configuration - Reference: https://trac.ffmpeg.org/wiki/Encode/MP3 - - MP3 with default config - - >>> AudioEffector(format="mp3") - - MP3 with variable bitrate - - >>> AudioEffector(format="mp3", codec_config=CodecConfig(qscale=5)) - - MP3 with constant bitrate - - >>> AudioEffector(format="mp3", codec_config=CodecConfig(bit_rate=32_000)) - """ - - def __init__( - self, - effect: Optional[str] = None, - format: Optional[str] = None, - *, - encoder: Optional[str] = None, - codec_config: Optional[CodecConfig] = None, - pad_end: bool = True, - ): - if format is None: - if encoder is not None or codec_config is not None: - raise ValueError("`encoder` and/or `condec_config` opions are provided without `format` option.") - self.effect = effect - self.format = format - self.encoder = encoder - self.codec_config = codec_config - self.pad_end = pad_end - - def _get_reader(self, waveform, sample_rate, output_sample_rate, frames_per_chunk=None): - num_frames, num_channels = waveform.shape - - if self.format is not None: - muxer = self.format - encoder = self.encoder - option = {} - # Some formats are headerless, so need to provide these infomation. 
- if self.format == "mulaw": - option = {"sample_rate": f"{sample_rate}", "channels": f"{num_channels}"} - - else: # PCM - muxer = _get_muxer(waveform.dtype) - encoder = None - option = {"sample_rate": f"{sample_rate}", "channels": f"{num_channels}"} - - if frames_per_chunk is None: - src = _encode(waveform, sample_rate, self.effect, muxer, encoder, self.codec_config) - else: - src = _AudioStreamingEncoder( - waveform, sample_rate, self.effect, muxer, encoder, self.codec_config, frames_per_chunk - ) - - output_sr = sample_rate if output_sample_rate is None else output_sample_rate - filter_desc = _get_afilter_desc(output_sr, _get_sample_fmt(waveform.dtype), num_channels) - if self.pad_end: - filter_desc = f"{filter_desc},apad=whole_len={num_frames}" - - reader = StreamReader(src, format=muxer, option=option) - reader.add_audio_stream(frames_per_chunk or -1, -1, filter_desc=filter_desc) - return reader - - def apply(self, waveform: Tensor, sample_rate: int, output_sample_rate: Optional[int] = None) -> Tensor: - """Apply the effect and/or codecs to the whole tensor. - - Args: - waveform (Tensor): The input waveform. Shape: ``(time, channel)`` - sample_rate (int): Sample rate of the input waveform. - output_sample_rate (int or None, optional): Output sample rate. - If provided, override the output sample rate. - Otherwise, the resulting tensor is resampled to have - the same sample rate as the input. - Default: ``None``. - - Returns: - Tensor: - Resulting Tensor. Shape: ``(time, channel)``. The number of frames - could be different from that of the input. - """ - if waveform.ndim != 2: - raise ValueError(f"Expected the input waveform to be 2D. 
Found: {waveform.ndim}") - - if waveform.numel() == 0: - return waveform - - reader = self._get_reader(waveform, sample_rate, output_sample_rate) - reader.process_all_packets() - (applied,) = reader.pop_chunks() - return Tensor(applied) - - def stream( - self, waveform: Tensor, sample_rate: int, frames_per_chunk: int, output_sample_rate: Optional[int] = None - ) -> Iterator[Tensor]: - """Apply the effect and/or codecs to the given tensor chunk by chunk. - - Args: - waveform (Tensor): The input waveform. Shape: ``(time, channel)`` - sample_rate (int): Sample rate of the waveform. - frames_per_chunk (int): The number of frames to return at a time. - output_sample_rate (int or None, optional): Output sample rate. - If provided, override the output sample rate. - Otherwise, the resulting tensor is resampled to have - the same sample rate as the input. - Default: ``None``. - - Returns: - Iterator[Tensor]: - Series of processed chunks. Shape: ``(time, channel)``, where the - the number of frames matches ``frames_per_chunk`` except the - last chunk, which could be shorter. - """ - if waveform.ndim != 2: - raise ValueError(f"Expected the input waveform to be 2D. 
Found: {waveform.ndim}") - - if waveform.numel() == 0: - return waveform - - reader = self._get_reader(waveform, sample_rate, output_sample_rate, frames_per_chunk) - for (applied,) in reader.stream(): - yield Tensor(applied) diff --git a/src/torchaudio/io/_playback.py b/src/torchaudio/io/_playback.py deleted file mode 100644 index 7183ee3ba8..0000000000 --- a/src/torchaudio/io/_playback.py +++ /dev/null @@ -1,72 +0,0 @@ -import warnings -from sys import platform -from typing import Optional - -import torch -import torchaudio - -dict_format = { - torch.uint8: "u8", - torch.int16: "s16", - torch.int32: "s32", - torch.int64: "s64", - torch.float32: "flt", - torch.float64: "dbl", -} - - -def play_audio( - waveform: torch.Tensor, - sample_rate: Optional[float], - device: Optional[str] = None, -) -> None: - """Plays audio through specified or available output device. - - .. warning:: - This function is currently only supported on MacOS, and requires - libavdevice (FFmpeg) with ``audiotoolbox`` output device. - - .. note:: - This function can play up to two audio channels. - - Args: - waveform: Tensor containing the audio to play. - Expected shape: `(time, num_channels)`. - sample_rate: Sample rate of the audio to play. - device: Output device to use. If None, the default device is used. - """ - - if platform == "darwin": - device = device or "audiotoolbox" - path = "-" - else: - raise ValueError(f"This function only supports MacOS, but current OS is {platform}") - - available_devices = list(torchaudio.utils.ffmpeg_utils.get_output_devices().keys()) - if device not in available_devices: - raise ValueError(f"Device {device} is not available. Available devices are: {available_devices}") - - if waveform.dtype not in dict_format: - raise ValueError(f"Unsupported type {waveform.dtype}. 
The list of supported types is: {dict_format.keys()}") - format = dict_format[waveform.dtype] - - if waveform.ndim != 2: - raise ValueError(f"Expected 2D tensor with shape `(time, num_channels)`, got {waveform.ndim}D tensor instead") - - time, num_channels = waveform.size() - if num_channels > 2: - warnings.warn( - f"Expected up to 2 channels, got {num_channels} channels instead. " - "Only the first 2 channels will be played.", - stacklevel=2, - ) - - # Write to speaker device - s = torchaudio.io.StreamWriter(dst=path, format=device) - s.add_audio_stream(sample_rate, num_channels, format=format) - - # write audio to the device - block_size = 256 - with s.open(): - for i in range(0, time, block_size): - s.write_audio_chunk(0, waveform[i : i + block_size, :]) From a3002211592397a4a4aa507f7ebd0626bd125231 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jul 2025 10:18:18 +0100 Subject: [PATCH 05/35] Let load and save rely on *_with_torchcodec --- src/torchaudio/__init__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index e533cafe9d..1fde90b871 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -7,8 +7,6 @@ get_audio_backend as _get_audio_backend, info as _info, list_audio_backends as _list_audio_backends, - load, - save, set_audio_backend as _set_audio_backend, ) from ._torchcodec import load_with_torchcodec, save_with_torchcodec @@ -41,6 +39,13 @@ pass +def load(*args, **kwargs): + return load_with_torchcodec(*args, **kwargs) + +def save(*args, **kwargs): + return save_with_torchcodec(*args, **kwargs) + + __all__ = [ "AudioMetaData", "load", From 07e3b77f565d153ec3c8d6eb2cba3de93bd8c1dd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 16 Jul 2025 13:49:53 +0100 Subject: [PATCH 06/35] install torchcodec in doc job --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From 92719d3abe1c206f8f3b0a6e3531a53e0ef30933 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 12 Aug 2025 19:53:00 +0000 Subject: [PATCH 07/35] Add docstring and arguments for load and save --- src/torchaudio/__init__.py | 177 ++++++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 4 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1fde90b871..ed4be65d6d 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -39,12 +39,181 @@ pass -def load(*args, **kwargs): - return load_with_torchcodec(*args, **kwargs) +def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. -def save(*args, **kwargs): - return save_with_torchcodec(*args, **kwargs) + .. note:: + This function supports the same API as :func:`~torchaudio.load`, and + relies on TorchCodec's decoding capabilities under the hood. 
It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + In TorchAudio 2.9, :func:`~torchaudio.load` will be relying on + :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of + :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and + ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. 
+ RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + +def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, +) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + This function supports the same API as :func:`~torchaudio.save`, and + relies on TorchCodec's encoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + In TorchAudio 2.9, :func:`~torchaudio.save` will be relying on + :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of + :func:`~torchaudio.save`, like ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by + are ignored by :func:`~torchaudio.save_with_torchcodec`. + + This function provides a TorchCodec-based alternative to torchaudio.save + with the same API. TorchCodec's AudioEncoder provides efficient encoding + with FFmpeg under the hood. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. 
+ + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. 
+ """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From 4a98ee5f36552ead8e3cf6bf143f7b4484dd897c Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 14:42:00 +0000 Subject: [PATCH 08/35] Revise docstring --- src/torchaudio/__init__.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index ed4be65d6d..37d20a76aa 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -53,16 +53,13 @@ def load( .. note:: - This function supports the same API as :func:`~torchaudio.load`, and - relies on TorchCodec's decoding capabilities under the hood. It is + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is provided for convenience, but we do recommend that you port your code to natively use ``torchcodec``'s ``AudioDecoder`` class for better performance: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - In TorchAudio 2.9, :func:`~torchaudio.load` will be relying on - :func:`~torchaudio.load_with_torchcodec`. Note that some parameters of - :func:`~torchaudio.load`, like ``normalize``, ``buffer_size``, and - ``backend``, are ignored by :func:`~torchaudio.load_with_torchcodec`. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. Args: @@ -136,21 +133,14 @@ def save( .. note:: - This function supports the same API as :func:`~torchaudio.save`, and - relies on TorchCodec's encoding capabilities under the hood. 
It is - provided for convenience, but we do recommend that you port your code to + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to natively use ``torchcodec``'s ``AudioEncoder`` class for better performance: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - In TorchAudio 2.9, :func:`~torchaudio.save` will be relying on - :func:`~torchaudio.save_with_torchcodec`. Note that some parameters of - :func:`~torchaudio.save`, like ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored by - are ignored by :func:`~torchaudio.save_with_torchcodec`. - - This function provides a TorchCodec-based alternative to torchaudio.save - with the same API. TorchCodec's AudioEncoder provides efficient encoding - with FFmpeg under the hood. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. Args: uri (path-like object): From 7b02754b407e42cca822d3d2ce5e7eeb60d2b01f Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 15:13:14 +0000 Subject: [PATCH 09/35] Add typing imports --- src/torchaudio/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 37d20a76aa..60c8ceb7fe 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -1,4 +1,7 @@ from torchaudio._internal.module_utils import dropping_io_support, dropping_class_io_support +from typing import Union, BinaryIO, Optional, Tuple +import os +import torch # Initialize extension and backend first from . 
import _extension # noqa # usort: skip From 74edc0a8dbe942aae3f04924d1743f4da49800cb Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 16:00:40 +0000 Subject: [PATCH 10/35] Try ffmpeg>4 --- .github/scripts/unittest-linux/install.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index a7ae9bfcf4..2163502b2e 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -86,8 +86,7 @@ pip install . -v --no-build-isolation # 3. Install Test tools printf "* Installing test tools\n" -# On this CI, for whatever reason, we're only able to install ffmpeg 4. -conda install -y "ffmpeg<5" +conda install -y "ffmpeg>4" python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" From 80f5eb7778afd5efc1a2c601583c84ffb5aa2401 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 16:22:24 +0000 Subject: [PATCH 11/35] Install conda deps before pip deps --- .github/scripts/unittest-linux/install.sh | 30 ++++++++++++----------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 2163502b2e..6a347577d5 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,20 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - - -# 2. Install torchaudio -conda install --quiet -y ninja cmake - -printf "* Installing torchaudio\n" -export BUILD_CPP_TEST=1 -pip install . -v --no-build-isolation -# 3. 
Install Test tools -printf "* Installing test tools\n" -conda install -y "ffmpeg>4" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -97,12 +84,27 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} libvorbis parameterized 'requests>=2.20' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} "ffmpeg>4" libvorbis parameterized 'requests>=2.20' pip install SoundFile coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above pip install pillow scipy "numpy>=1.26" ) + +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + + +# 2. Install torchaudio +conda install --quiet -y ninja cmake + +printf "* Installing torchaudio\n" +export BUILD_CPP_TEST=1 +pip install . -v --no-build-isolation + +# 3. 
Install Test tools +printf "* Installing test tools\n" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" + # Install fairseq git clone https://github.com/pytorch/fairseq cd fairseq From 7f063a6ce08b442de93471f8891e88e65544e0b3 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 18:11:05 +0000 Subject: [PATCH 12/35] Add scipy hack for load and save --- src/torchaudio/__init__.py | 369 ++++++++++++++++++++----------------- 1 file changed, 203 insertions(+), 166 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 60c8ceb7fe..5910743607 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,6 +2,8 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch +from scipy.io import wavfile +import sys # Initialize extension and backend first from . import _extension # noqa # usort: skip @@ -41,172 +43,207 @@ except ImportError: pass - -def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from source using TorchCodec's AudioDecoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioDecoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and - ``backend`` are ignored and accepted only for backwards compatibility. - - - Args: - uri (path-like object or file-like object): - Source of audio data. 
The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method. - - frame_offset (int, optional): - Number of samples to skip before start reading data. - num_frames (int, optional): - Maximum number of samples to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - normalize (bool, optional): - TorchCodec always returns normalized float32 samples. This parameter - is ignored and a warning is issued if set to False. - Default: ``True``. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Format hint for the decoder. May not be supported by all TorchCodec - decoders. (Default: ``None``) - buffer_size (int, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - backend (str or None, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - Always returns float32 tensors. If ``channels_first=True``, shape is - `[channel, time]`, otherwise `[time, channel]`. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If unsupported parameters are used. - RuntimeError: If TorchCodec fails to decode the audio. - - Note: - - TorchCodec always returns normalized float32 samples, so the ``normalize`` - parameter has no effect. - - The ``buffer_size`` and ``backend`` parameters are ignored. - - Not all audio formats supported by torchaudio backends may be supported - by TorchCodec. 
- """ - return load_with_torchcodec( - uri, - frame_offset=frame_offset, - num_frames=num_frames, - normalize=normalize, - channels_first=channels_first, - format=format, - buffer_size=buffer_size, - backend=backend - ) - -def save( - uri: Union[str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, -) -> None: - """Save audio data to file using TorchCodec's AudioEncoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. - It is provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioEncoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for - backwards compatibility. - - Args: - uri (path-like object): - Path to save the audio file. The file extension determines the format. - - src (torch.Tensor): - Audio data to save. Must be a 1D or 2D tensor with float32 values - in the range [-1, 1]. If 2D, shape should be [channel, time] when - channels_first=True, or [time, channel] when channels_first=False. - - sample_rate (int): - Sample rate of the audio data. - - channels_first (bool, optional): - Indicates whether the input tensor has channels as the first dimension. - If True, expects [channel, time]. If False, expects [time, channel]. - Default: True. - - format (str or None, optional): - Audio format hint. Not used by TorchCodec (format is determined by - file extension). A warning is issued if provided. - Default: None. 
- - encoding (str or None, optional): - Audio encoding. Not fully supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - bits_per_sample (int or None, optional): - Bits per sample. Not directly supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - buffer_size (int, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if not default value. Default: 4096. - - backend (str or None, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if provided. Default: None. - - compression (float, int or None, optional): - Compression level or bit rate. Maps to bit_rate parameter in - TorchCodec AudioEncoder. Default: None. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If input parameters are invalid. - RuntimeError: If TorchCodec fails to encode the audio. - - Note: - - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. - - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) - are not used by TorchCodec but are provided for API compatibility. - - The output format is determined by the file extension in the uri. - - TorchCodec uses FFmpeg under the hood for encoding. - """ - return save_with_torchcodec(uri, src, sample_rate, - channels_first=channels_first, - format=format, - encoding=encoding, - bits_per_sample=bits_per_sample, - buffer_size=buffer_size, - backend=backend, - compression=compression) +# CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack +# allows CI to build with ffmpeg4 and works around load/test bugginess. 
+if "pytest" in sys.modules: + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + rate, data = wavfile.read(uri) + if data.ndim == 1: + data = data[:,None] + if num_frames == -1: + num_frames = data.shape[0] - frame_offset + data = data[frame_offset:frame_offset + num_frames] + if channels_first: + data = data.T + return data, rate + + def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, + ): + wavfile.write(uri, sample_rate, src.numpy()) +else: + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. 
+ * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. + RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. 
+ """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + + def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, + ) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. + + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. 
+ + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. 
+ """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From 700c6c9b0a36efc2a8bdeb8c348a84707e67edff Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:17:46 +0000 Subject: [PATCH 13/35] Only import scipy during testing --- .github/scripts/unittest-linux/install.sh | 1 - src/torchaudio/__init__.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 6a347577d5..e4fa67b1e5 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -93,7 +93,6 @@ fi pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - # 2. Install torchaudio conda install --quiet -y ninja cmake diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 5910743607..ca34b996cf 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -2,7 +2,6 @@ from typing import Union, BinaryIO, Optional, Tuple import os import torch -from scipy.io import wavfile import sys # Initialize extension and backend first @@ -46,6 +45,7 @@ # CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack # allows CI to build with ffmpeg4 and works around load/test bugginess. if "pytest" in sys.modules: + from scipy.io import wavfile def load( uri: Union[BinaryIO, str, os.PathLike], frame_offset: int = 0, From 6995b21ebacdb99f9952f6dead2b504284c63496 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:52:30 +0000 Subject: [PATCH 14/35] Revert "Install conda deps before pip deps" This reverts commit 80f5eb7778afd5efc1a2c601583c84ffb5aa2401. 
--- .github/scripts/unittest-linux/install.sh | 28 +++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index e4fa67b1e5..9f99fd1e98 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,19 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + +# 2. Install torchaudio +conda install --quiet -y ninja cmake +printf "* Installing torchaudio\n" +export BUILD_CPP_TEST=1 +pip install . -v --no-build-isolation + +# 3. Install Test tools +printf "* Installing test tools\n" +conda install -y "ffmpeg>4" +python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then @@ -84,26 +96,12 @@ if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then fi ( set -x - conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} "ffmpeg>4" libvorbis parameterized 'requests>=2.20' + conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} libvorbis parameterized 'requests>=2.20' pip install SoundFile coverage pytest pytest-cov scipy expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics flashlight-text git+https://github.com/kpu/kenlm # TODO: might be better to fix the single call to `pip install` above pip install pillow scipy "numpy>=1.26" ) - -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" - -# 2. Install torchaudio -conda install --quiet -y ninja cmake - -printf "* Installing torchaudio\n" -export BUILD_CPP_TEST=1 -pip install . -v --no-build-isolation - -# 3. 
Install Test tools -printf "* Installing test tools\n" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" - # Install fairseq git clone https://github.com/pytorch/fairseq cd fairseq From 4ab5993566d2109b53c92b9b494ea27be5a555b9 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:52:35 +0000 Subject: [PATCH 15/35] Revert "Try ffmpeg>4" This reverts commit 74edc0a8dbe942aae3f04924d1743f4da49800cb. --- .github/scripts/unittest-linux/install.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 9f99fd1e98..15bf71e907 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -85,7 +85,8 @@ pip install . -v --no-build-isolation # 3. Install Test tools printf "* Installing test tools\n" -conda install -y "ffmpeg>4" +# On this CI, for whatever reason, we're only able to install ffmpeg 4. 
+conda install -y "ffmpeg<5" python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" From 43c460285b61eb4bc412005cad6536e3ac513a3b Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 19:53:21 +0000 Subject: [PATCH 16/35] Revert torchcodec installation changes --- .github/scripts/unittest-linux/install.sh | 1 + .github/workflows/build_docs.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index 15bf71e907..a7ae9bfcf4 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -76,6 +76,7 @@ esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + # 2. Install torchaudio conda install --quiet -y ninja cmake diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index f681e3b7ec..e92c556218 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. 
PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From f74f00423ade5d7c2a1f426193533a0772a7d40e Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:00:05 +0000 Subject: [PATCH 17/35] Use existing wav_utils --- src/torchaudio/__init__.py | 24 +++++-------------- .../torchaudio/utils}/wav_utils.py | 0 .../common_utils/__init__.py | 2 +- 3 files changed, 7 insertions(+), 19 deletions(-) rename {test/torchaudio_unittest/common_utils => src/torchaudio/utils}/wav_utils.py (100%) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index ca34b996cf..1ff3a530e4 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -45,28 +45,16 @@ # CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack # allows CI to build with ffmpeg4 and works around load/test bugginess. 
if "pytest" in sys.modules: - from scipy.io import wavfile + from torchaudio.utils import wav_utils def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, + uri: str, + normalize: bool = True, channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, ) -> Tuple[torch.Tensor, int]: - rate, data = wavfile.read(uri) - if data.ndim == 1: - data = data[:,None] - if num_frames == -1: - num_frames = data.shape[0] - frame_offset - data = data[frame_offset:frame_offset + num_frames] - if channels_first: - data = data.T - return data, rate + return wav_utils.load_wav(uri, normalize, channels_first) def save( - uri: Union[str, os.PathLike], + uri: str, src: torch.Tensor, sample_rate: int, channels_first: bool = True, @@ -77,7 +65,7 @@ def save( backend: Optional[str] = None, compression: Optional[Union[float, int]] = None, ): - wavfile.write(uri, sample_rate, src.numpy()) + wav_utils.save_wav(uri, src, sample_rate, channels_first=channels_first) else: def load( uri: Union[BinaryIO, str, os.PathLike], diff --git a/test/torchaudio_unittest/common_utils/wav_utils.py b/src/torchaudio/utils/wav_utils.py similarity index 100% rename from test/torchaudio_unittest/common_utils/wav_utils.py rename to src/torchaudio/utils/wav_utils.py diff --git a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 509d5208df..93ac7e0821 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -26,7 +26,7 @@ from .func_utils import torch_script from .image_utils import get_image, rgb_to_gray, rgb_to_yuv_ccir, save_image from .parameterized_utils import load_params, nested_params -from .wav_utils import get_wav_data, load_wav, normalize_wav, save_wav +from torchaudio.utils.wav_utils import get_wav_data, load_wav, normalize_wav, save_wav import pytest class RequestMixin: From 
89ca133522d1d362070f9299b79469c3e10a72eb Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:32:05 +0000 Subject: [PATCH 18/35] Remove _backend folder --- src/torchaudio/__init__.py | 20 - src/torchaudio/_backend/__init__.py | 61 --- src/torchaudio/_backend/backend.py | 53 --- src/torchaudio/_backend/common.py | 52 --- src/torchaudio/_backend/ffmpeg.py | 334 -------------- src/torchaudio/_backend/soundfile.py | 54 --- src/torchaudio/_backend/soundfile_backend.py | 457 ------------------- src/torchaudio/_backend/sox.py | 91 ---- src/torchaudio/_backend/utils.py | 350 -------------- src/torchaudio/backend/__init__.py | 8 - src/torchaudio/backend/_no_backend.py | 25 - src/torchaudio/backend/_sox_io_backend.py | 294 ------------ src/torchaudio/backend/common.py | 13 - src/torchaudio/backend/no_backend.py | 14 - src/torchaudio/backend/soundfile_backend.py | 14 - src/torchaudio/backend/sox_io_backend.py | 14 - 16 files changed, 1854 deletions(-) delete mode 100644 src/torchaudio/_backend/__init__.py delete mode 100644 src/torchaudio/_backend/backend.py delete mode 100644 src/torchaudio/_backend/common.py delete mode 100644 src/torchaudio/_backend/ffmpeg.py delete mode 100644 src/torchaudio/_backend/soundfile.py delete mode 100644 src/torchaudio/_backend/soundfile_backend.py delete mode 100644 src/torchaudio/_backend/sox.py delete mode 100644 src/torchaudio/_backend/utils.py delete mode 100644 src/torchaudio/backend/__init__.py delete mode 100644 src/torchaudio/backend/_no_backend.py delete mode 100644 src/torchaudio/backend/_sox_io_backend.py delete mode 100644 src/torchaudio/backend/common.py delete mode 100644 src/torchaudio/backend/no_backend.py delete mode 100644 src/torchaudio/backend/soundfile_backend.py delete mode 100644 src/torchaudio/backend/sox_io_backend.py diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1ff3a530e4..b226210547 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -6,21 +6,8 
@@ # Initialize extension and backend first from . import _extension # noqa # usort: skip -from ._backend import ( # noqa # usort: skip - AudioMetaData as _AudioMetaData, - get_audio_backend as _get_audio_backend, - info as _info, - list_audio_backends as _list_audio_backends, - set_audio_backend as _set_audio_backend, -) from ._torchcodec import load_with_torchcodec, save_with_torchcodec -AudioMetaData = dropping_class_io_support(_AudioMetaData) -get_audio_backend = dropping_io_support(_get_audio_backend) -info = dropping_io_support(_info) -list_audio_backends = dropping_io_support(_list_audio_backends) -set_audio_backend = dropping_io_support(_set_audio_backend) - from . import ( # noqa: F401 compliance, datasets, @@ -34,8 +21,6 @@ utils, ) -# For BC -from . import backend # noqa # usort: skip try: from .version import __version__, git_version # noqa: F401 @@ -234,11 +219,9 @@ def save( compression=compression) __all__ = [ - "AudioMetaData", "load", "load_with_torchcodec", "save_with_torchcodec", - "info", "save", "io", "compliance", @@ -250,7 +233,4 @@ def save( "utils", "sox_effects", "transforms", - "list_audio_backends", - "get_audio_backend", - "set_audio_backend", ] diff --git a/src/torchaudio/_backend/__init__.py b/src/torchaudio/_backend/__init__.py deleted file mode 100644 index 27337013ff..0000000000 --- a/src/torchaudio/_backend/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import List, Optional - -from torchaudio._internal.module_utils import deprecated - -from . import utils -from .common import AudioMetaData - -__all__ = [ - "AudioMetaData", - "load", - "info", - "save", - "list_audio_backends", - "get_audio_backend", - "set_audio_backend", -] - - -info = utils.get_info_func() -load = utils.get_load_func() -save = utils.get_save_func() - - -def list_audio_backends() -> List[str]: - """List available backends - - Returns: - list of str: The list of available backends. - - The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``. 
- """ - - return list(utils.get_available_backends().keys()) - - -# Temporary until global backend is removed -@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.") -def get_audio_backend() -> Optional[str]: - """Get the name of the current global backend - - Returns: - str or None: - If dispatcher mode is enabled, returns ``None`` otherwise, - the name of current backend or ``None`` (no backend is set). - """ - return None - - -# Temporary until global backend is removed -@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.") -def set_audio_backend(backend: Optional[str]): # noqa - """Set the global backend. - - This is a no-op when dispatcher mode is enabled. - - Args: - backend (str or None): Name of the backend. - One of ``"sox_io"`` or ``"soundfile"`` based on availability - of the system. If ``None`` is provided the current backend is unassigned. - """ - pass diff --git a/src/torchaudio/_backend/backend.py b/src/torchaudio/_backend/backend.py deleted file mode 100644 index 579340962c..0000000000 --- a/src/torchaudio/_backend/backend.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from abc import ABC, abstractmethod -from typing import BinaryIO, Optional, Tuple, Union - -from torch import Tensor -from torchaudio.io import CodecConfig - -from .common import AudioMetaData - - -class Backend(ABC): - @staticmethod - @abstractmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - raise NotImplementedError - - @staticmethod - @abstractmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[Tensor, int]: - raise NotImplementedError - - @staticmethod - @abstractmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: Tensor, 
- sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[CodecConfig, float, int]] = None, - ) -> None: - raise NotImplementedError - - @staticmethod - @abstractmethod - def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - raise NotImplementedError - - @staticmethod - @abstractmethod - def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - raise NotImplementedError diff --git a/src/torchaudio/_backend/common.py b/src/torchaudio/_backend/common.py deleted file mode 100644 index 804b18d461..0000000000 --- a/src/torchaudio/_backend/common.py +++ /dev/null @@ -1,52 +0,0 @@ -class AudioMetaData: - """AudioMetaData() - - Return type of ``torchaudio.info`` function. - - :ivar int sample_rate: Sample rate - :ivar int num_frames: The number of frames - :ivar int num_channels: The number of channels - :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats, - or when it cannot be accurately inferred. 
- :ivar str encoding: Audio encoding - The values encoding can take are one of the following: - - * ``PCM_S``: Signed integer linear PCM - * ``PCM_U``: Unsigned integer linear PCM - * ``PCM_F``: Floating point linear PCM - * ``FLAC``: Flac, Free Lossless Audio Codec - * ``ULAW``: Mu-law - * ``ALAW``: A-law - * ``MP3`` : MP3, MPEG-1 Audio Layer III - * ``VORBIS``: OGG Vorbis - * ``AMR_WB``: Adaptive Multi-Rate Wideband - * ``AMR_NB``: Adaptive Multi-Rate Narrowband - * ``OPUS``: Opus - * ``HTK``: Single channel 16-bit PCM - * ``UNKNOWN`` : None of above - """ - - def __init__( - self, - sample_rate: int, - num_frames: int, - num_channels: int, - bits_per_sample: int, - encoding: str, - ): - self.sample_rate = sample_rate - self.num_frames = num_frames - self.num_channels = num_channels - self.bits_per_sample = bits_per_sample - self.encoding = encoding - - def __str__(self): - return ( - f"AudioMetaData(" - f"sample_rate={self.sample_rate}, " - f"num_frames={self.num_frames}, " - f"num_channels={self.num_channels}, " - f"bits_per_sample={self.bits_per_sample}, " - f"encoding={self.encoding}" - f")" - ) diff --git a/src/torchaudio/_backend/ffmpeg.py b/src/torchaudio/_backend/ffmpeg.py deleted file mode 100644 index ca8374ea07..0000000000 --- a/src/torchaudio/_backend/ffmpeg.py +++ /dev/null @@ -1,334 +0,0 @@ -import os -import re -import sys -from typing import BinaryIO, Optional, Tuple, Union - -import torch -import torchaudio - -from .backend import Backend -from .common import AudioMetaData - -InputType = Union[BinaryIO, str, os.PathLike] - - -def info_audio( - src: InputType, - format: Optional[str], - buffer_size: int = 4096, -) -> AudioMetaData: - s = torchaudio.io.StreamReader(src, format, None, buffer_size) - sinfo = s.get_src_stream_info(s.default_audio_stream) - if sinfo.num_frames == 0: - waveform = _load_audio(s) - num_frames = waveform.size(1) - else: - num_frames = sinfo.num_frames - return AudioMetaData( - int(sinfo.sample_rate), - num_frames, - 
sinfo.num_channels, - sinfo.bits_per_sample, - sinfo.codec.upper(), - ) - - -def _get_load_filter( - frame_offset: int = 0, - num_frames: int = -1, - convert: bool = True, -) -> Optional[str]: - if frame_offset < 0: - raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset)) - if num_frames == 0 or num_frames < -1: - raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. Found: {}".format(num_frames)) - - # All default values -> no filter - if frame_offset == 0 and num_frames == -1 and not convert: - return None - # Only convert - aformat = "aformat=sample_fmts=fltp" - if frame_offset == 0 and num_frames == -1 and convert: - return aformat - # At least one of frame_offset or num_frames has non-default value - if num_frames > 0: - atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames) - else: - atrim = "atrim=start_sample={}".format(frame_offset) - if not convert: - return atrim - return "{},{}".format(atrim, aformat) - - -def _load_audio( - s: "torchaudio.io.StreamReader", - filter: Optional[str] = None, - channels_first: bool = True, -) -> torch.Tensor: - s.add_audio_stream(-1, -1, filter_desc=filter) - s.process_all_packets() - chunk = s.pop_chunks()[0] - if chunk is None: - raise RuntimeError("Failed to decode audio.") - waveform = chunk._elem - return waveform.T if channels_first else waveform - - -def load_audio( - src: InputType, - frame_offset: int = 0, - num_frames: int = -1, - convert: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, -) -> Tuple[torch.Tensor, int]: - if hasattr(src, "read") and format == "vorbis": - format = "ogg" - s = torchaudio.io.StreamReader(src, format, None, buffer_size) - sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate) - filter = _get_load_filter(frame_offset, num_frames, convert) - waveform = _load_audio(s, filter, channels_first) - return 
waveform, sample_rate - - -def _get_sample_format(dtype: torch.dtype) -> str: - dtype_to_format = { - torch.uint8: "u8", - torch.int16: "s16", - torch.int32: "s32", - torch.int64: "s64", - torch.float32: "flt", - torch.float64: "dbl", - } - format = dtype_to_format.get(dtype) - if format is None: - raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.") - return format - - -def _native_endianness() -> str: - if sys.byteorder == "little": - return "le" - else: - return "be" - - -def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str: - if bits_per_sample not in {None, 8, 16, 24, 32, 64}: - raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.") - endianness = _native_endianness() - if not encoding: - if not bits_per_sample: - # default to PCM S16 - return f"pcm_s16{endianness}" - if bits_per_sample == 8: - return "pcm_u8" - return f"pcm_s{bits_per_sample}{endianness}" - if encoding == "PCM_S": - if not bits_per_sample: - bits_per_sample = 16 - if bits_per_sample == 8: - raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.") - return f"pcm_s{bits_per_sample}{endianness}" - if encoding == "PCM_U": - if bits_per_sample in (None, 8): - return "pcm_u8" - raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.") - if encoding == "PCM_F": - if not bits_per_sample: - bits_per_sample = 32 - if bits_per_sample in (32, 64): - return f"pcm_f{bits_per_sample}{endianness}" - raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "pcm_mulaw" - raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.") - if encoding == "ALAW": - if bits_per_sample in (None, 8): - return "pcm_alaw" - raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.") - raise ValueError(f"WAV encoding {encoding} is not supported.") - - -def 
_get_flac_sample_fmt(bps): - if bps is None or bps == 16: - return "s16" - if bps == 24: - return "s32" - raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).") - - -def _parse_save_args( - ext: Optional[str], - format: Optional[str], - encoding: Optional[str], - bps: Optional[int], -): - # torchaudio's save function accepts the followings, which do not 1to1 map - # to FFmpeg. - # - # - format: audio format - # - bits_per_sample: encoder sample format - # - encoding: such as PCM_U8. - # - # In FFmpeg, format is specified with the following three (and more) - # - # - muxer: could be audio format or container format. - # the one we passed to the constructor of StreamWriter - # - encoder: the audio encoder used to encode audio - # - encoder sample format: the format used by encoder to encode audio. - # - # If encoder sample format is different from source sample format, StreamWriter - # will insert a filter automatically. - # - def _type(spec): - # either format is exactly the specified one - # or extension matches to the spec AND there is no format override. - return format == spec or (format is None and ext == spec) - - if _type("wav") or _type("amb"): - # wav is special because it supports different encoding through encoders - # each encoder only supports one encoder format - # - # amb format is a special case originated from libsox. - # It is basically a WAV format, with slight modification. - # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795 - # It is a format so that decoders will recognize it as ambisonic. - # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/ - # FFmpeg does not recognize amb because it is basically a WAV format. - muxer = "wav" - encoder = _get_encoder_for_wav(encoding, bps) - sample_fmt = None - elif _type("vorbis"): - # FFpmeg does not recognize vorbis extension, while libsox used to do. 
- # For the sake of bakward compatibility, (and the simplicity), - # we support the case where users want to do save("foo.vorbis") - muxer = "ogg" - encoder = "vorbis" - sample_fmt = None - else: - muxer = format - encoder = None - sample_fmt = None - if _type("flac"): - sample_fmt = _get_flac_sample_fmt(bps) - if _type("ogg"): - sample_fmt = _get_flac_sample_fmt(bps) - return muxer, encoder, sample_fmt - - -def save_audio( - uri: InputType, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[torchaudio.io.CodecConfig] = None, -) -> None: - ext = None - if hasattr(uri, "write"): - if format is None: - raise RuntimeError("'format' is required when saving to file object.") - else: - uri = os.path.normpath(uri) - if tokens := str(uri).split(".")[1:]: - ext = tokens[-1].lower() - - muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample) - - if channels_first: - src = src.T - - s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size) - s.add_audio_stream( - sample_rate, - num_channels=src.size(-1), - format=_get_sample_format(src.dtype), - encoder=encoder, - encoder_format=enc_fmt, - codec_config=compression, - ) - with s.open(): - s.write_audio_chunk(0, src) - - -def _map_encoding(encoding: str) -> str: - for dst in ["PCM_S", "PCM_U", "PCM_F"]: - if dst in encoding: - return dst - if encoding == "PCM_MULAW": - return "ULAW" - elif encoding == "PCM_ALAW": - return "ALAW" - return encoding - - -def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str: - if m := re.search(r"PCM_\w(\d+)\w*", encoding): - return int(m.group(1)) - elif encoding in ["PCM_ALAW", "PCM_MULAW"]: - return 8 - return bits_per_sample - - -class FFmpegBackend(Backend): - @staticmethod - def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> 
AudioMetaData: - metadata = info_audio(uri, format, buffer_size) - metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample) - metadata.encoding = _map_encoding(metadata.encoding) - return metadata - - @staticmethod - def load( - uri: InputType, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format) - - @staticmethod - def save( - uri: InputType, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, - ) -> None: - if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))): - raise ValueError( - "FFmpeg backend expects non-`None` value for argument `compression` to be of ", - f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}", - ) - save_audio( - uri, - src, - sample_rate, - channels_first, - format, - encoding, - bits_per_sample, - buffer_size, - compression, - ) - - @staticmethod - def can_decode(uri: InputType, format: Optional[str]) -> bool: - return True - - @staticmethod - def can_encode(uri: InputType, format: Optional[str]) -> bool: - return True diff --git a/src/torchaudio/_backend/soundfile.py b/src/torchaudio/_backend/soundfile.py deleted file mode 100644 index f4be1f7099..0000000000 --- a/src/torchaudio/_backend/soundfile.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from typing import BinaryIO, Optional, Tuple, Union - -import torch -from torchaudio.io import CodecConfig - -from . 
import soundfile_backend -from .backend import Backend -from .common import AudioMetaData - - -class SoundfileBackend(Backend): - @staticmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - return soundfile_backend.info(uri, format) - - @staticmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format) - - @staticmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[CodecConfig, float, int]] = None, - ) -> None: - if compression: - raise ValueError("soundfile backend does not support argument `compression`.") - - soundfile_backend.save( - uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample - ) - - @staticmethod - def can_decode(uri, format) -> bool: - return True - - @staticmethod - def can_encode(uri, format) -> bool: - return True diff --git a/src/torchaudio/_backend/soundfile_backend.py b/src/torchaudio/_backend/soundfile_backend.py deleted file mode 100644 index 9e7b0b13cd..0000000000 --- a/src/torchaudio/_backend/soundfile_backend.py +++ /dev/null @@ -1,457 +0,0 @@ -"""The new soundfile backend which will become default in 0.8.0 onward""" -import warnings -from typing import Optional, Tuple - -import torch -from torchaudio._internal import module_utils as _mod_utils - -from .common import AudioMetaData - - -_IS_SOUNDFILE_AVAILABLE = False - -# TODO: import soundfile only when it is used. 
-if _mod_utils.is_module_available("soundfile"): - try: - import soundfile - - _requires_soundfile = _mod_utils.no_op - _IS_SOUNDFILE_AVAILABLE = True - except Exception: - _requires_soundfile = _mod_utils.fail_with_message( - "requires soundfile, but we failed to import it. Please check the installation of soundfile." - ) -else: - _requires_soundfile = _mod_utils.fail_with_message( - "requires soundfile, but it is not installed. Please install soundfile." - ) - - -# Mapping from soundfile subtype to number of bits per sample. -# This is mostly heuristical and the value is set to 0 when it is irrelevant -# (lossy formats) or when it can't be inferred. -# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: -# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, -# the default seems to be 8 bits but it can be compressed further to 4 bits. -# The dict is inspired from -# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 -_SUBTYPE_TO_BITS_PER_SAMPLE = { - "PCM_S8": 8, # Signed 8 bit data - "PCM_16": 16, # Signed 16 bit data - "PCM_24": 24, # Signed 24 bit data - "PCM_32": 32, # Signed 32 bit data - "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) - "FLOAT": 32, # 32 bit float data - "DOUBLE": 64, # 64 bit float data - "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "IMA_ADPCM": 0, # IMA ADPCM. - "MS_ADPCM": 0, # Microsoft ADPCM. - "GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) - "VOX_ADPCM": 0, # OKI / Dialogix ADPCM - "G721_32": 0, # 32kbs G721 ADPCM encoding. - "G723_24": 0, # 24kbs G723 ADPCM encoding. - "G723_40": 0, # 40kbs G723 ADPCM encoding. - "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. 
- "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. - "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. - "DWVW_N": 0, # N bit Delta Width Variable Word encoding. - "DPCM_8": 8, # 8 bit differential PCM (XI only) - "DPCM_16": 16, # 16 bit differential PCM (XI only) - "VORBIS": 0, # Xiph Vorbis encoding. (lossy) - "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). - "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). - "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). - "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). -} - - -def _get_bit_depth(subtype): - if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: - warnings.warn( - f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample " - "attribute will be set to 0. If you are seeing this warning, please " - "report by opening an issue on github (after checking for existing/closed ones). " - "You may otherwise ignore this warning." - ) - return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) - - -_SUBTYPE_TO_ENCODING = { - "PCM_S8": "PCM_S", - "PCM_16": "PCM_S", - "PCM_24": "PCM_S", - "PCM_32": "PCM_S", - "PCM_U8": "PCM_U", - "FLOAT": "PCM_F", - "DOUBLE": "PCM_F", - "ULAW": "ULAW", - "ALAW": "ALAW", - "VORBIS": "VORBIS", -} - - -def _get_encoding(format: str, subtype: str): - if format == "FLAC": - return "FLAC" - return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") - - -@_requires_soundfile -def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: - """Get signal information of an audio file. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - which has a restriction on type annotation due to TorchScript compiler compatiblity. - - Args: - filepath (path-like object or file-like object): - Source of audio data. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. 
- - Returns: - AudioMetaData: meta data of the given audio. - - """ - sinfo = soundfile.info(filepath) - return AudioMetaData( - sinfo.samplerate, - sinfo.frames, - sinfo.channels, - bits_per_sample=_get_bit_depth(sinfo.subtype), - encoding=_get_encoding(sinfo.format, sinfo.subtype), - ) - - -_SUBTYPE2DTYPE = { - "PCM_S8": "int8", - "PCM_U8": "uint8", - "PCM_16": "int16", - "PCM_32": "int32", - "FLOAT": "float32", - "DOUBLE": "float64", -} - - -@_requires_soundfile -def load( - filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype, and the shape of `[channel, time]`. - - .. warning:: - - ``normalize`` argument does not perform volume normalization. - It only converts the sample type to `torch.float32` from the native sample - type. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, - this function can return integer Tensor, where the samples are expressed within the whole range - of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, - ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not - support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. - - ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. 
- - For these formats, this function always returns ``float32`` Tensor with values. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - which has a restriction on type annotation due to TorchScript compiler compatiblity. - - Args: - filepath (path-like object or file-like object): - Source of audio data. - frame_offset (int, optional): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and normalization is off, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. 
- """ - with soundfile.SoundFile(filepath, "r") as file_: - if file_.format != "WAV" or normalize: - dtype = "float32" - elif file_.subtype not in _SUBTYPE2DTYPE: - raise ValueError(f"Unsupported subtype: {file_.subtype}") - else: - dtype = _SUBTYPE2DTYPE[file_.subtype] - - frames = file_._prepare_read(frame_offset, None, num_frames) - waveform = file_.read(frames, dtype, always_2d=True) - sample_rate = file_.samplerate - - waveform = torch.from_numpy(waveform) - if channels_first: - waveform = waveform.t() - return waveform, sample_rate - - -def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int): - if not encoding: - if not bits_per_sample: - subtype = { - torch.uint8: "PCM_U8", - torch.int16: "PCM_16", - torch.int32: "PCM_32", - torch.float32: "FLOAT", - torch.float64: "DOUBLE", - }.get(dtype) - if not subtype: - raise ValueError(f"Unsupported dtype for wav: {dtype}") - return subtype - if bits_per_sample == 8: - return "PCM_U8" - return f"PCM_{bits_per_sample}" - if encoding == "PCM_S": - if not bits_per_sample: - return "PCM_32" - if bits_per_sample == 8: - raise ValueError("wav does not support 8-bit signed PCM encoding.") - return f"PCM_{bits_per_sample}" - if encoding == "PCM_U": - if bits_per_sample in (None, 8): - return "PCM_U8" - raise ValueError("wav only supports 8-bit unsigned PCM encoding.") - if encoding == "PCM_F": - if bits_per_sample in (None, 32): - return "FLOAT" - if bits_per_sample == 64: - return "DOUBLE" - raise ValueError("wav only supports 32/64-bit float PCM encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("wav only supports 8-bit mu-law encoding.") - if encoding == "ALAW": - if bits_per_sample in (None, 8): - return "ALAW" - raise ValueError("wav only supports 8-bit a-law encoding.") - raise ValueError(f"wav does not support {encoding}.") - - -def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): - if encoding in (None, "PCM_S"): - return 
f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" - if encoding in ("PCM_U", "PCM_F"): - raise ValueError(f"sph does not support {encoding} encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("sph only supports 8-bit for mu-law encoding.") - if encoding == "ALAW": - return "ALAW" - raise ValueError(f"sph does not support {encoding}.") - - -def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int): - if format == "wav": - return _get_subtype_for_wav(dtype, encoding, bits_per_sample) - if format == "flac": - if encoding: - raise ValueError("flac does not support encoding.") - if not bits_per_sample: - return "PCM_16" - if bits_per_sample > 24: - raise ValueError("flac does not support bits_per_sample > 24.") - return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" - if format in ("ogg", "vorbis"): - if bits_per_sample: - raise ValueError("ogg/vorbis does not support bits_per_sample.") - if encoding is None or encoding == "vorbis": - return "VORBIS" - if encoding == "opus": - return "OPUS" - raise ValueError(f"Unexpected encoding: {encoding}") - if format == "mp3": - return "MPEG_LAYER_III" - if format == "sph": - return _get_subtype_for_sphere(encoding, bits_per_sample) - if format in ("nis", "nist"): - return "PCM_16" - raise ValueError(f"Unsupported format: {format}") - - -@_requires_soundfile -def save( - filepath: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - compression: Optional[float] = None, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -): - """Save audio data to file. - - Note: - The formats this function can handle depend on the soundfile installation. 
- This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - which has a restriction on type annotation due to TorchScript compiler compatiblity. - - Args: - filepath (str or pathlib.Path): Path to audio file. - src (torch.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - compression (float of None, optional): Not used. - It is here only for interface compatibility reson with "sox_io" backend. - format (str or None, optional): Override the audio format. - When ``filepath`` argument is path-like object, audio format is - inferred from file extension. If the file extension is missing or - different, you can specify the correct format with this argument. - - When ``filepath`` argument is file-like object, - this argument is required. - - Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, - ``"flac"`` and ``"sph"``. - encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, sush as - ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are; - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - bits_per_sample (int or None, optional): Changes the bit depth for the - supported formats. - When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, - you can change the bit depth. - Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. 
- - Supported formats/encodings/bit depth/compression are: - - ``"wav"`` - - 32-bit floating-point PCM - - 32-bit signed integer PCM - - 24-bit signed integer PCM - - 16-bit signed integer PCM - - 8-bit unsigned integer PCM - - 8-bit mu-law - - 8-bit a-law - - Note: - Default encoding/bit depth is determined by the dtype of - the input Tensor. - - ``"flac"`` - - 8-bit - - 16-bit (default) - - 24-bit - - ``"ogg"``, ``"vorbis"`` - - Doesn't accept changing configuration. - - ``"sph"`` - - 8-bit signed integer PCM - - 16-bit signed integer PCM - - 24-bit signed integer PCM - - 32-bit signed integer PCM (default) - - 8-bit mu-law - - 8-bit a-law - - 16-bit a-law - - 24-bit a-law - - 32-bit a-law - - """ - if src.ndim != 2: - raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") - if compression is not None: - warnings.warn( - '`save` function of "soundfile" backend does not support "compression" parameter. ' - "The argument is silently ignored." - ) - if hasattr(filepath, "write"): - if format is None: - raise RuntimeError("`format` is required when saving to file object.") - ext = format.lower() - else: - ext = str(filepath).split(".")[-1].lower() - - if bits_per_sample not in (None, 8, 16, 24, 32, 64): - raise ValueError("Invalid bits_per_sample.") - if bits_per_sample == 24: - warnings.warn( - "Saving audio with 24 bits per sample might warp samples near -1. " - "Using 16 bits per sample might be able to avoid this." 
- ) - subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) - - # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, - # so we extend the extensions manually here - if ext in ["nis", "nist", "sph"] and format is None: - format = "NIST" - - if channels_first: - src = src.t() - - soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format) diff --git a/src/torchaudio/_backend/sox.py b/src/torchaudio/_backend/sox.py deleted file mode 100644 index f26ce83ca0..0000000000 --- a/src/torchaudio/_backend/sox.py +++ /dev/null @@ -1,91 +0,0 @@ -import os -from typing import BinaryIO, Optional, Tuple, Union - -import torch -import torchaudio - -from .backend import Backend -from .common import AudioMetaData - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -class SoXBackend(Backend): - @staticmethod - def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: - if hasattr(uri, "read"): - raise ValueError( - "SoX backend does not support reading from file-like objects. ", - "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.", - ) - else: - sinfo = sox_ext.get_info(uri, format) - if sinfo: - return AudioMetaData(*sinfo) - else: - raise RuntimeError(f"Failed to fetch metadata for {uri}.") - - @staticmethod - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - ) -> Tuple[torch.Tensor, int]: - if hasattr(uri, "read"): - raise ValueError( - "SoX backend does not support loading from file-like objects. ", - "Please use an alternative backend that does support loading from file-like objects, e.g. 
FFmpeg.", - ) - else: - ret = sox_ext.load_audio_file(str(uri), frame_offset, num_frames, normalize, channels_first, format) - if not ret: - raise RuntimeError(f"Failed to load audio from {uri}.") - return ret - - @staticmethod - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, - ) -> None: - if not isinstance(compression, (float, int, type(None))): - raise ValueError( - "SoX backend expects non-`None` value for argument `compression` to be of ", - f"type `float` or `int`, but received value of type {type(compression)}", - ) - if hasattr(uri, "write"): - raise ValueError( - "SoX backend does not support writing to file-like objects. ", - "Please use an alternative backend that does support writing to file-like objects, e.g. FFmpeg.", - ) - else: - sox_ext.save_audio_file( - str(uri), - src, - sample_rate, - channels_first, - compression, - format, - encoding, - bits_per_sample, - ) - - @staticmethod - def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - # i.e. not a file-like object. - return not hasattr(uri, "read") - - @staticmethod - def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: - # i.e. not a file-like object. 
- return not hasattr(uri, "write") diff --git a/src/torchaudio/_backend/utils.py b/src/torchaudio/_backend/utils.py deleted file mode 100644 index eb7c51f0cb..0000000000 --- a/src/torchaudio/_backend/utils.py +++ /dev/null @@ -1,350 +0,0 @@ -import os -from functools import lru_cache -from typing import BinaryIO, Dict, Optional, Tuple, Type, Union -import warnings - -import torch - -from torchaudio._extension import lazy_import_sox_ext -from torchaudio.io import CodecConfig -from torio._extension import lazy_import_ffmpeg_ext - -from . import soundfile_backend - -from .backend import Backend -from .common import AudioMetaData -from .ffmpeg import FFmpegBackend -from .soundfile import SoundfileBackend -from .sox import SoXBackend - - -@lru_cache(None) -def get_available_backends() -> Dict[str, Type[Backend]]: - backend_specs: Dict[str, Type[Backend]] = {} - if lazy_import_ffmpeg_ext().is_available(): - backend_specs["ffmpeg"] = FFmpegBackend - if lazy_import_sox_ext().is_available(): - backend_specs["sox"] = SoXBackend - if soundfile_backend._IS_SOUNDFILE_AVAILABLE: - backend_specs["soundfile"] = SoundfileBackend - return backend_specs - - -def get_backend(backend_name, backends) -> Backend: - if backend := backends.get(backend_name): - return backend - else: - raise ValueError( - f"Unsupported backend '{backend_name}' specified; ", - f"please select one of {list(backends.keys())} instead.", - ) - - -def get_info_func(): - backends = get_available_backends() - - def dispatcher( - uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] - ) -> Backend: - if backend_name is not None: - return get_backend(backend_name, backends) - - for backend in backends.values(): - if backend.can_decode(uri, format): - return backend - raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") - - def info( - uri: Union[BinaryIO, str, os.PathLike], - format: Optional[str] = None, - buffer_size: int = 4096, - 
backend: Optional[str] = None, - ) -> AudioMetaData: - """Get signal information of an audio file. - - Note: - When the input type is file-like object, this function cannot - get the correct length (``num_samples``) for certain formats, - such as ``vorbis``. - In this case, the value of ``num_samples`` is ``0``. - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. - * ``file-like``: Object with ``read(size: int) -> bytes`` method, - which returns byte string of at most ``size`` length. - - format (str or None, optional): - If not ``None``, interpreted as hint that may allow backend to override the detected format. - (Default: ``None``) - - buffer_size (int, optional): - Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) - - backend (str or None, optional): - I/O backend to use. - If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], - with the corresponding backend available. - (Default: ``None``) - - .. 
seealso:: - :ref:`backend` - - Returns: - AudioMetaData - """ - backend = dispatcher(uri, format, backend) - return backend.info(uri, format, buffer_size) - - return info - - -def get_load_func(): - backends = get_available_backends() - - def dispatcher( - uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] - ) -> Backend: - if backend_name is not None: - return get_backend(backend_name, backends) - - for backend in backends.values(): - if backend.can_decode(uri, format): - return backend - raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") - - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - ) -> Tuple[torch.Tensor, int]: - """Load audio data from source. - - .. warning:: - In 2.9, this function's implementation will be changed to use - :func:`~torchaudio.load_with_torchcodec` under the hood. Some - parameters like ``normalize``, ``format``, ``buffer_size``, and - ``backend`` will be ignored. We recommend that you port your code to - rely directly on TorchCodec's decoder instead: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder. - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype, and the shape of `[channel, time]`. - - Note: - The formats this function can handle depend on the availability of backends. - Please use the following functions to fetch the supported formats. - - - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders` - - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats` - - SoundFile: Refer to `the official document `__. - - .. warning:: - - ``normalize`` argument does not perform volume normalization. 
- It only converts the sample type to `torch.float32` from the native sample - type. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, - this function can return integer Tensor, where the samples are expressed within the whole range - of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, - ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not - support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. - - ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - - For these formats, this function always returns ``float32`` Tensor with values. - - - Args: - uri (path-like object or file-like object): - Source of audio data. - frame_offset (int, optional): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - - format (str or None, optional): - If not ``None``, interpreted as hint that may allow backend to override the detected format. - (Default: ``None``) - - buffer_size (int, optional): - Size of buffer to use when processing file-like objects, in bytes. 
(Default: ``4096``) - - backend (str or None, optional): - I/O backend to use. - If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], - with the corresponding backend being available. (Default: ``None``) - - .. seealso:: - :ref:`backend` - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and normalization is off, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. - """ - warnings.warn( - "In 2.9, this function's implementation will be changed to use " - "torchaudio.load_with_torchcodec` under the hood. Some " - "parameters like ``normalize``, ``format``, ``buffer_size``, and " - "``backend`` will be ignored. We recommend that you port your code to " - "rely directly on TorchCodec's decoder instead: " - "https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder." 
- ) - backend = dispatcher(uri, format, backend) - return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size) - - return load - - -def get_save_func(): - backends = get_available_backends() - - def dispatcher( - uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] - ) -> Backend: - if backend_name is not None: - return get_backend(backend_name, backends) - - for backend in backends.values(): - if backend.can_encode(uri, format): - return backend - raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") - - def save( - uri: Union[BinaryIO, str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[CodecConfig, float, int]] = None, - ): - """Save audio data to file. - - .. warning:: - In 2.9, this function's implementation will be changed to use - :func:`~torchaudio.save_with_torchcodec` under the hood. Some - parameters like format, encoding, bits_per_sample, buffer_size, and - ``backend`` will be ignored. We recommend that you port your code to - rely directly on TorchCodec's decoder instead: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder - - Note: - The formats this function can handle depend on the availability of backends. - Please use the following functions to fetch the supported formats. - - - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders` - - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats` - - SoundFile: Refer to `the official document `__. - - Args: - uri (str or pathlib.Path): Path to audio file. - src (torch.Tensor): Audio data to save. must be 2D tensor. 
- sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - format (str or None, optional): Override the audio format. - When ``uri`` argument is path-like object, audio format is - inferred from file extension. If the file extension is missing or - different, you can specify the correct format with this argument. - - When ``uri`` argument is file-like object, - this argument is required. - - Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``. - encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, i.e. - ``"wav"`` and ``""flac"```. Valid values are - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - bits_per_sample (int or None, optional): Changes the bit depth for the - supported formats. - When ``format`` is one of ``"wav"`` and ``"flac"``, - you can change the bit depth. - Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. - - buffer_size (int, optional): - Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) - - backend (str or None, optional): - I/O backend to use. - If ``None``, function selects backend given input and available backends. - Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], - with the corresponding backend being available. - (Default: ``None``) - - .. seealso:: - :ref:`backend` - - compression (CodecConfig, float, int, or None, optional): - Compression configuration to apply. - - If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided. - - Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the - ``sox`` command line interface must be provided. 
For instance: - - ``"mp3"`` - Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or - VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. - - ``"flac"`` - Whole number from ``0`` to ``8``. ``8`` is default and highest compression. - - ``"ogg"``, ``"vorbis"`` - Number from ``-1`` to ``10``; ``-1`` is the highest compression - and lowest quality. Default: ``3``. - - Refer to http://sox.sourceforge.net/soxformat.html for more details. - - """ - warnings.warn( - "In 2.9, this function's implementation will be changed to use " - "torchaudio.save_with_torchcodec` under the hood. Some " - "parameters like format, encoding, bits_per_sample, buffer_size, and " - "``backend`` will be ignored. We recommend that you port your code to " - "rely directly on TorchCodec's encoder instead: " - "https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder" - ) - backend = dispatcher(uri, format, backend) - return backend.save( - uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression - ) - - return save diff --git a/src/torchaudio/backend/__init__.py b/src/torchaudio/backend/__init__.py deleted file mode 100644 index 84df7e7d69..0000000000 --- a/src/torchaudio/backend/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# NOTE: -# The entire `torchaudio.backend` module is deprecated. -# New things should be added to `torchaudio._backend`. -# Only things related to backward compatibility should be placed here. - -from . 
import common, no_backend, soundfile_backend, sox_io_backend # noqa - -__all__ = [] diff --git a/src/torchaudio/backend/_no_backend.py b/src/torchaudio/backend/_no_backend.py deleted file mode 100644 index fcbb2ad84a..0000000000 --- a/src/torchaudio/backend/_no_backend.py +++ /dev/null @@ -1,25 +0,0 @@ -from pathlib import Path -from typing import Callable, Optional, Tuple, Union - -from torch import Tensor -from torchaudio import AudioMetaData - - -def load( - filepath: Union[str, Path], - out: Optional[Tensor] = None, - normalization: Union[bool, float, Callable] = True, - channels_first: bool = True, - num_frames: int = 0, - offset: int = 0, - filetype: Optional[str] = None, -) -> Tuple[Tensor, int]: - raise RuntimeError("No audio I/O backend is available.") - - -def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: - raise RuntimeError("No audio I/O backend is available.") - - -def info(filepath: str) -> AudioMetaData: - raise RuntimeError("No audio I/O backend is available.") diff --git a/src/torchaudio/backend/_sox_io_backend.py b/src/torchaudio/backend/_sox_io_backend.py deleted file mode 100644 index 6af267b17a..0000000000 --- a/src/torchaudio/backend/_sox_io_backend.py +++ /dev/null @@ -1,294 +0,0 @@ -import os -from typing import Optional, Tuple - -import torch -import torchaudio -from torchaudio import AudioMetaData - -sox_ext = torchaudio._extension.lazy_import_sox_ext() - - -def info( - filepath: str, - format: Optional[str] = None, -) -> AudioMetaData: - """Get signal information of an audio file. - - Args: - filepath (str): - Source of audio data. - - format (str or None, optional): - Override the format detection with the given format. - Providing the argument might help when libsox can not infer the format - from header or extension. - - Returns: - AudioMetaData: Metadata of the given audio. 
- """ - if not torch.jit.is_scripting(): - if hasattr(filepath, "read"): - raise RuntimeError("sox_io backend does not support file-like object.") - filepath = os.fspath(filepath) - sinfo = sox_ext.get_info(filepath, format) - return AudioMetaData(*sinfo) - - -def load( - filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[torch.Tensor, int]: - """Load audio data from file. - - Note: - This function can handle all the codecs that underlying libsox can handle, - however it is tested on the following formats; - - * WAV, AMB - - * 32-bit floating-point - * 32-bit signed integer - * 24-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer (WAV only) - - * MP3 - * FLAC - * OGG/VORBIS - * OPUS - * SPHERE - * AMR-NB - - To load ``MP3``, ``FLAC``, ``OGG/VORBIS``, ``OPUS`` and other codecs ``libsox`` does not - handle natively, your installation of ``torchaudio`` has to be linked to ``libsox`` - and corresponding codec libraries such as ``libmad`` or ``libmp3lame`` etc. - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype, and the shape of `[channel, time]`. - - .. warning:: - - ``normalize`` argument does not perform volume normalization. - It only converts the sample type to `torch.float32` from the native sample - type. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, - this function can return integer Tensor, where the samples are expressed within the whole range - of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, - ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not - support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. 
- - ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - - For these formats, this function always returns ``float32`` Tensor with values. - - Args: - filepath (path-like object): Source of audio data. - frame_offset (int): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function converts the native sample type to ``float32``. - Default: ``True``. - - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Override the format detection with the given format. - Providing the argument might help when libsox can not infer the format - from header or extension. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and ``normalize=False``, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. 
- """ - if not torch.jit.is_scripting(): - if hasattr(filepath, "read"): - raise RuntimeError("sox_io backend does not support file-like object.") - filepath = os.fspath(filepath) - return sox_ext.load_audio_file(filepath, frame_offset, num_frames, normalize, channels_first, format) - - -def save( - filepath: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - compression: Optional[float] = None, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -): - """Save audio data to file. - - Args: - filepath (path-like object): Path to save file. - src (torch.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - compression (float or None, optional): Used for formats other than WAV. - This corresponds to ``-C`` option of ``sox`` command. - - ``"mp3"`` - Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or - VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. - - ``"flac"`` - Whole number from ``0`` to ``8``. ``8`` is default and highest compression. - - ``"ogg"``, ``"vorbis"`` - Number from ``-1`` to ``10``; ``-1`` is the highest compression - and lowest quality. Default: ``3``. - - See the detail at http://sox.sourceforge.net/soxformat.html. - format (str or None, optional): Override the audio format. - When ``filepath`` argument is path-like object, audio format is infered from - file extension. If file extension is missing or different, you can specify the - correct format with this argument. - - When ``filepath`` argument is file-like object, this argument is required. - - Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``, - ``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``. 
- - encoding (str or None, optional): Changes the encoding for the supported formats. - This argument is effective only for supported formats, such as ``"wav"``, ``""amb"`` - and ``"sph"``. Valid values are; - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - Default values - If not provided, the default value is picked based on ``format`` and ``bits_per_sample``. - - ``"wav"``, ``"amb"`` - - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the - | Tensor is used to determine the default value. - - - ``"PCM_U"`` if dtype is ``uint8`` - - ``"PCM_S"`` if dtype is ``int16`` or ``int32`` - - ``"PCM_F"`` if dtype is ``float32`` - - - ``"PCM_U"`` if ``bits_per_sample=8`` - - ``"PCM_S"`` otherwise - - ``"sph"`` format; - - the default value is ``"PCM_S"`` - - bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. - When ``format`` is one of ``"wav"``, ``"flac"``, ``"sph"``, or ``"amb"``, you can change the - bit depth. Valid values are ``8``, ``16``, ``32`` and ``64``. - - Default Value; - If not provided, the default values are picked based on ``format`` and ``"encoding"``; - - ``"wav"``, ``"amb"``; - - | If both ``encoding`` and ``bits_per_sample`` are not provided, the ``dtype`` of the - | Tensor is used. - - - ``8`` if dtype is ``uint8`` - - ``16`` if dtype is ``int16`` - - ``32`` if dtype is ``int32`` or ``float32`` - - - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"`` - - ``16`` if ``encoding`` is ``"PCM_S"`` - - ``32`` if ``encoding`` is ``"PCM_F"`` - - ``"flac"`` format; - - the default value is ``24`` - - ``"sph"`` format; - - ``16`` if ``encoding`` is ``"PCM_U"``, ``"PCM_S"``, ``"PCM_F"`` or not provided. 
- - ``8`` if ``encoding`` is ``"ULAW"`` or ``"ALAW"`` - - ``"amb"`` format; - - ``8`` if ``encoding`` is ``"PCM_U"``, ``"ULAW"`` or ``"ALAW"`` - - ``16`` if ``encoding`` is ``"PCM_S"`` or not provided. - - ``32`` if ``encoding`` is ``"PCM_F"`` - - Supported formats/encodings/bit depth/compression are; - - ``"wav"``, ``"amb"`` - - 32-bit floating-point PCM - - 32-bit signed integer PCM - - 24-bit signed integer PCM - - 16-bit signed integer PCM - - 8-bit unsigned integer PCM - - 8-bit mu-law - - 8-bit a-law - - Note: Default encoding/bit depth is determined by the dtype of the input Tensor. - - ``"mp3"`` - Fixed bit rate (such as 128kHz) and variable bit rate compression. - Default: VBR with high quality. - - ``"flac"`` - - 8-bit - - 16-bit - - 24-bit (default) - - ``"ogg"``, ``"vorbis"`` - - Different quality level. Default: approx. 112kbps - - ``"sph"`` - - 8-bit signed integer PCM - - 16-bit signed integer PCM - - 24-bit signed integer PCM - - 32-bit signed integer PCM (default) - - 8-bit mu-law - - 8-bit a-law - - 16-bit a-law - - 24-bit a-law - - 32-bit a-law - - ``"amr-nb"`` - Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s - - ``"gsm"`` - Lossy Speech Compression, CPU intensive. - - ``"htk"`` - Uses a default single-channel 16-bit PCM format. - - Note: - To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``, - ``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has - to be linked to ``libsox`` and corresponding codec libraries such as ``libmad`` - or ``libmp3lame`` etc. 
- """ - if not torch.jit.is_scripting(): - if hasattr(filepath, "write"): - raise RuntimeError("sox_io backend does not handle file-like object.") - filepath = os.fspath(filepath) - sox_ext.save_audio_file( - filepath, - src, - sample_rate, - channels_first, - compression, - format, - encoding, - bits_per_sample, - ) diff --git a/src/torchaudio/backend/common.py b/src/torchaudio/backend/common.py deleted file mode 100644 index 3f736bf401..0000000000 --- a/src/torchaudio/backend/common.py +++ /dev/null @@ -1,13 +0,0 @@ -def __getattr__(name: str): - if name == "AudioMetaData": - import warnings - - warnings.warn( - "`torchaudio.backend.common.AudioMetaData` has been moved to " - "`torchaudio.AudioMetaData`. Please update the import path.", - stacklevel=2, - ) - from torchaudio import AudioMetaData - - return AudioMetaData - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/torchaudio/backend/no_backend.py b/src/torchaudio/backend/no_backend.py deleted file mode 100644 index b5aad59a1c..0000000000 --- a/src/torchaudio/backend/no_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. " - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from . import _no_backend - - return getattr(_no_backend, name) diff --git a/src/torchaudio/backend/soundfile_backend.py b/src/torchaudio/backend/soundfile_backend.py deleted file mode 100644 index ef8612fc6e..0000000000 --- a/src/torchaudio/backend/soundfile_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. 
" - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from torchaudio._backend import soundfile_backend - - return getattr(soundfile_backend, name) diff --git a/src/torchaudio/backend/sox_io_backend.py b/src/torchaudio/backend/sox_io_backend.py deleted file mode 100644 index 7e83b8fbf4..0000000000 --- a/src/torchaudio/backend/sox_io_backend.py +++ /dev/null @@ -1,14 +0,0 @@ -def __getattr__(name: str): - import warnings - - warnings.warn( - "Torchaudio's I/O functions now support per-call backend dispatch. " - "Importing backend implementation directly is no longer guaranteed to work. " - "Please use `backend` keyword with load/save/info function, instead of " - "calling the underlying implementation directly.", - stacklevel=2, - ) - - from . import _sox_io_backend - - return getattr(_sox_io_backend, name) From 953fc6579960cb0339c41726e36e511aa31299c7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Wed, 13 Aug 2025 21:55:08 +0000 Subject: [PATCH 19/35] Support frame_offset and num_frames in load hack --- src/torchaudio/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 1ff3a530e4..592a2cbe6a 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -48,10 +48,18 @@ from torchaudio.utils import wav_utils def load( uri: str, + frame_offset: int = 0, + num_frames: int = -1, normalize: bool = True, channels_first: bool = True, ) -> Tuple[torch.Tensor, int]: - return wav_utils.load_wav(uri, normalize, channels_first) + data, sample_rate = wav_utils.load_wav(uri, normalize, channels_first=False) + if num_frames == -1: + num_frames = data.shape[0] - frame_offset + data = data[frame_offset:frame_offset+num_frames] + if channels_first: + data = data.transpose(0, 1) + return 
data, sample_rate def save( uri: str, From dd3ff90799685c8a98565d959c9204fba1cd5097 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 01:03:46 +0000 Subject: [PATCH 20/35] Use rand instead of randn for test_save_channels_first --- test/torchaudio_unittest/test_load_save_torchcodec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 3edb4c423b..90fcc15689 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -227,9 +227,9 @@ def test_save_channels_first(channels_first): """Test channels_first parameter.""" # Create test data if channels_first: - waveform = torch.randn(2, 16000) # [channel, time] + waveform = torch.rand(2, 16000) # [channel, time] else: - waveform = torch.randn(16000, 2) # [time, channel] + waveform = torch.rand(16000, 2) # [time, channel] sample_rate = 16000 From c94e011ecc5a64f0a550034011157f6cdee34f2d Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 14:38:27 +0000 Subject: [PATCH 21/35] Remove pytest-aware code in src --- src/torchaudio/__init__.py | 364 +++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 198 deletions(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 592a2cbe6a..0c321c96d2 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -42,204 +42,172 @@ except ImportError: pass -# CI cannot currently build with ffmpeg>4, but torchcodec is buggy with ffmpeg4. This hack -# allows CI to build with ffmpeg4 and works around load/test bugginess. 
-if "pytest" in sys.modules: - from torchaudio.utils import wav_utils - def load( - uri: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - ) -> Tuple[torch.Tensor, int]: - data, sample_rate = wav_utils.load_wav(uri, normalize, channels_first=False) - if num_frames == -1: - num_frames = data.shape[0] - frame_offset - data = data[frame_offset:frame_offset+num_frames] - if channels_first: - data = data.transpose(0, 1) - return data, sample_rate - - def save( - uri: str, - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, - ): - wav_utils.save_wav(uri, src, sample_rate, channels_first=channels_first) -else: - def load( - uri: Union[BinaryIO, str, os.PathLike], - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - ) -> Tuple[torch.Tensor, int]: - """Load audio data from source using TorchCodec's AudioDecoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is - provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioDecoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. - Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and - ``backend`` are ignored and accepted only for backwards compatibility. - - - Args: - uri (path-like object or file-like object): - Source of audio data. The following types are accepted: - - * ``path-like``: File path or URL. 
- * ``file-like``: Object with ``read(size: int) -> bytes`` method. - - frame_offset (int, optional): - Number of samples to skip before start reading data. - num_frames (int, optional): - Maximum number of samples to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - normalize (bool, optional): - TorchCodec always returns normalized float32 samples. This parameter - is ignored and a warning is issued if set to False. - Default: ``True``. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Format hint for the decoder. May not be supported by all TorchCodec - decoders. (Default: ``None``) - buffer_size (int, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - backend (str or None, optional): - Not used by TorchCodec AudioDecoder. Provided for API compatibility. - - Returns: - (torch.Tensor, int): Resulting Tensor and sample rate. - Always returns float32 tensors. If ``channels_first=True``, shape is - `[channel, time]`, otherwise `[time, channel]`. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If unsupported parameters are used. - RuntimeError: If TorchCodec fails to decode the audio. - - Note: - - TorchCodec always returns normalized float32 samples, so the ``normalize`` - parameter has no effect. - - The ``buffer_size`` and ``backend`` parameters are ignored. - - Not all audio formats supported by torchaudio backends may be supported - by TorchCodec. 
- """ - return load_with_torchcodec( - uri, - frame_offset=frame_offset, - num_frames=num_frames, - normalize=normalize, - channels_first=channels_first, - format=format, - buffer_size=buffer_size, - backend=backend - ) - - def save( - uri: Union[str, os.PathLike], - src: torch.Tensor, - sample_rate: int, - channels_first: bool = True, - format: Optional[str] = None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, - buffer_size: int = 4096, - backend: Optional[str] = None, - compression: Optional[Union[float, int]] = None, - ) -> None: - """Save audio data to file using TorchCodec's AudioEncoder. - - .. note:: - - As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. - It is provided for convenience, but we do recommend that you port your code to - natively use ``torchcodec``'s ``AudioEncoder`` class for better - performance: - https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. - Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, - ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for - backwards compatibility. - - Args: - uri (path-like object): - Path to save the audio file. The file extension determines the format. - - src (torch.Tensor): - Audio data to save. Must be a 1D or 2D tensor with float32 values - in the range [-1, 1]. If 2D, shape should be [channel, time] when - channels_first=True, or [time, channel] when channels_first=False. - - sample_rate (int): - Sample rate of the audio data. - - channels_first (bool, optional): - Indicates whether the input tensor has channels as the first dimension. - If True, expects [channel, time]. If False, expects [time, channel]. - Default: True. - - format (str or None, optional): - Audio format hint. Not used by TorchCodec (format is determined by - file extension). A warning is issued if provided. - Default: None. 
- - encoding (str or None, optional): - Audio encoding. Not fully supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - bits_per_sample (int or None, optional): - Bits per sample. Not directly supported by TorchCodec AudioEncoder. - A warning is issued if provided. Default: None. - - buffer_size (int, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if not default value. Default: 4096. - - backend (str or None, optional): - Not used by TorchCodec AudioEncoder. Provided for API compatibility. - A warning is issued if provided. Default: None. - - compression (float, int or None, optional): - Compression level or bit rate. Maps to bit_rate parameter in - TorchCodec AudioEncoder. Default: None. - - Raises: - ImportError: If torchcodec is not available. - ValueError: If input parameters are invalid. - RuntimeError: If TorchCodec fails to encode the audio. - - Note: - - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. - - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) - are not used by TorchCodec but are provided for API compatibility. - - The output format is determined by the file extension in the uri. - - TorchCodec uses FFmpeg under the hood for encoding. - """ - return save_with_torchcodec(uri, src, sample_rate, - channels_first=channels_first, - format=format, - encoding=encoding, - bits_per_sample=bits_per_sample, - buffer_size=buffer_size, - backend=backend, - compression=compression) + +def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from source using TorchCodec's AudioDecoder. + + .. 
note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's decoding capabilities under the hood. It is + provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioDecoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder. + Because of the reliance on Torchcodec, the parameters ``normalize``, ``buffer_size``, and + ``backend`` are ignored and accepted only for backwards compatibility. + + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method. + + frame_offset (int, optional): + Number of samples to skip before start reading data. + num_frames (int, optional): + Maximum number of samples to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + normalize (bool, optional): + TorchCodec always returns normalized float32 samples. This parameter + is ignored and a warning is issued if set to False. + Default: ``True``. + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Format hint for the decoder. May not be supported by all TorchCodec + decoders. (Default: ``None``) + buffer_size (int, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + backend (str or None, optional): + Not used by TorchCodec AudioDecoder. Provided for API compatibility. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + Always returns float32 tensors. If ``channels_first=True``, shape is + `[channel, time]`, otherwise `[time, channel]`. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If unsupported parameters are used. 
+ RuntimeError: If TorchCodec fails to decode the audio. + + Note: + - TorchCodec always returns normalized float32 samples, so the ``normalize`` + parameter has no effect. + - The ``buffer_size`` and ``backend`` parameters are ignored. + - Not all audio formats supported by torchaudio backends may be supported + by TorchCodec. + """ + return load_with_torchcodec( + uri, + frame_offset=frame_offset, + num_frames=num_frames, + normalize=normalize, + channels_first=channels_first, + format=format, + buffer_size=buffer_size, + backend=backend + ) + +def save( + uri: Union[str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[float, int]] = None, +) -> None: + """Save audio data to file using TorchCodec's AudioEncoder. + + .. note:: + + As of TorchAudio 2.9, this function relies on TorchCodec's encoding capabilities under the hood. + It is provided for convenience, but we do recommend that you port your code to + natively use ``torchcodec``'s ``AudioEncoder`` class for better + performance: + https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.encoders.AudioEncoder. + Because of the reliance on Torchcodec, the parameters ``format``, ``encoding``, + ``bits_per_sample``, ``buffer_size``, and ``backend``, are ignored and accepted only for + backwards compatibility. + + Args: + uri (path-like object): + Path to save the audio file. The file extension determines the format. + + src (torch.Tensor): + Audio data to save. Must be a 1D or 2D tensor with float32 values + in the range [-1, 1]. If 2D, shape should be [channel, time] when + channels_first=True, or [time, channel] when channels_first=False. + + sample_rate (int): + Sample rate of the audio data. 
+ + channels_first (bool, optional): + Indicates whether the input tensor has channels as the first dimension. + If True, expects [channel, time]. If False, expects [time, channel]. + Default: True. + + format (str or None, optional): + Audio format hint. Not used by TorchCodec (format is determined by + file extension). A warning is issued if provided. + Default: None. + + encoding (str or None, optional): + Audio encoding. Not fully supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + bits_per_sample (int or None, optional): + Bits per sample. Not directly supported by TorchCodec AudioEncoder. + A warning is issued if provided. Default: None. + + buffer_size (int, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if not default value. Default: 4096. + + backend (str or None, optional): + Not used by TorchCodec AudioEncoder. Provided for API compatibility. + A warning is issued if provided. Default: None. + + compression (float, int or None, optional): + Compression level or bit rate. Maps to bit_rate parameter in + TorchCodec AudioEncoder. Default: None. + + Raises: + ImportError: If torchcodec is not available. + ValueError: If input parameters are invalid. + RuntimeError: If TorchCodec fails to encode the audio. + + Note: + - TorchCodec AudioEncoder expects float32 samples in [-1, 1] range. + - Some parameters (format, encoding, bits_per_sample, buffer_size, backend) + are not used by TorchCodec but are provided for API compatibility. + - The output format is determined by the file extension in the uri. + - TorchCodec uses FFmpeg under the hood for encoding. 
+ """ + return save_with_torchcodec(uri, src, sample_rate, + channels_first=channels_first, + format=format, + encoding=encoding, + bits_per_sample=bits_per_sample, + buffer_size=buffer_size, + backend=backend, + compression=compression) __all__ = [ "AudioMetaData", From b622d8209299382dbd40d14adaa069cf217c0df4 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 15:08:06 +0000 Subject: [PATCH 22/35] Remove torchcodec version check --- .github/scripts/unittest-linux/install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index a7ae9bfcf4..c8f47e63ab 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -88,7 +88,6 @@ pip install . -v --no-build-isolation printf "* Installing test tools\n" # On this CI, for whatever reason, we're only able to install ffmpeg 4. conda install -y "ffmpeg<5" -python -c "import torch; import torchaudio; import torchcodec; print(torch.__version__, torchaudio.__version__, torchcodec.__version__)" NUMBA_DEV_CHANNEL="" if [[ "$(python --version)" = *3.9* || "$(python --version)" = *3.10* ]]; then From 93351a24194727341be4b203f6618c9baadbccc7 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 15:58:18 +0000 Subject: [PATCH 23/35] Fix bugs in torchcodec mock --- test/conftest.py | 4 + .../common_utils/__init__.py | 2 +- .../common_utils/wav_utils.py | 92 +++++++++++++++++++ test/torchcodec/decoders.py | 17 ++-- test/torchcodec/encoders.py | 6 +- 5 files changed, 106 insertions(+), 15 deletions(-) create mode 100644 test/conftest.py create mode 100644 test/torchaudio_unittest/common_utils/wav_utils.py diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000000..35f7ae81ee --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,4 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).parent.resolve())) diff --git 
a/test/torchaudio_unittest/common_utils/__init__.py b/test/torchaudio_unittest/common_utils/__init__.py index 93ac7e0821..509d5208df 100644 --- a/test/torchaudio_unittest/common_utils/__init__.py +++ b/test/torchaudio_unittest/common_utils/__init__.py @@ -26,7 +26,7 @@ from .func_utils import torch_script from .image_utils import get_image, rgb_to_gray, rgb_to_yuv_ccir, save_image from .parameterized_utils import load_params, nested_params -from torchaudio.utils.wav_utils import get_wav_data, load_wav, normalize_wav, save_wav +from .wav_utils import get_wav_data, load_wav, normalize_wav, save_wav import pytest class RequestMixin: diff --git a/test/torchaudio_unittest/common_utils/wav_utils.py b/test/torchaudio_unittest/common_utils/wav_utils.py new file mode 100644 index 0000000000..db15494dca --- /dev/null +++ b/test/torchaudio_unittest/common_utils/wav_utils.py @@ -0,0 +1,92 @@ +from typing import Optional + +import scipy.io.wavfile +import torch + + +def normalize_wav(tensor: torch.Tensor) -> torch.Tensor: + if tensor.dtype == torch.float32: + pass + elif tensor.dtype == torch.int32: + tensor = tensor.to(torch.float32) + tensor[tensor > 0] /= 2147483647.0 + tensor[tensor < 0] /= 2147483648.0 + elif tensor.dtype == torch.int16: + tensor = tensor.to(torch.float32) + tensor[tensor > 0] /= 32767.0 + tensor[tensor < 0] /= 32768.0 + elif tensor.dtype == torch.uint8: + tensor = tensor.to(torch.float32) - 128 + tensor[tensor > 0] /= 127.0 + tensor[tensor < 0] /= 128.0 + return tensor + + +def get_wav_data( + dtype: str, + num_channels: int, + *, + num_frames: Optional[int] = None, + normalize: bool = True, + channels_first: bool = True, +): + """Generate linear signal of the given dtype and num_channels + + Data range is + [-1.0, 1.0] for float32, + [-2147483648, 2147483647] for int32 + [-32768, 32767] for int16 + [0, 255] for uint8 + + num_frames allow to change the linear interpolation parameter. + Default values are 256 for uint8, else 1 << 16. 
+ 1 << 16 as default is so that int16 value range is completely covered. + """ + dtype_ = getattr(torch, dtype) + + if num_frames is None: + if dtype == "uint8": + num_frames = 256 + else: + num_frames = 1 << 16 + + if dtype == "uint8": + base = torch.linspace(0, 255, num_frames, dtype=dtype_) + elif dtype == "int8": + base = torch.linspace(-128, 127, num_frames, dtype=dtype_) + elif dtype == "float32": + base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "float64": + base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) + elif dtype == "int32": + base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) + elif dtype == "int16": + base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_) + else: + raise NotImplementedError(f"Unsupported dtype {dtype}") + data = base.repeat([num_channels, 1]) + if not channels_first: + data = data.transpose(1, 0) + if normalize: + data = normalize_wav(data) + return data + + +def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor: + """Load wav file without torchaudio""" + sample_rate, data = scipy.io.wavfile.read(path) + data = torch.from_numpy(data.copy()) + if data.ndim == 1: + data = data.unsqueeze(1) + if normalize: + data = normalize_wav(data) + if channels_first: + data = data.transpose(1, 0) + return data, sample_rate + + +def save_wav(path, data, sample_rate, channels_first=True): + """Save wav file without torchaudio""" + if channels_first: + data = data.transpose(1, 0) + scipy.io.wavfile.write(path, sample_rate, data.numpy()) diff --git a/test/torchcodec/decoders.py b/test/torchcodec/decoders.py index 94f2d8c8c1..8b2a7a3071 100644 --- a/test/torchcodec/decoders.py +++ b/test/torchcodec/decoders.py @@ -1,17 +1,12 @@ -import test.torchaudio_unittest.common_utils.wav_utils as wav_utils +import torchaudio_unittest.common_utils.wav_utils as wav_utils +from types import SimpleNamespace class AudioDecoder: def __init__(self, uri): self.uri = uri - - def 
get_all_samples(self): - return wav_utils.load_wav(self.uri) - - -class AudioEncoder: - def __init__(self, data, sample_rate): + data, sample_rate = wav_utils.load_wav(self.uri) + self.metadata = SimpleNamespace(sample_rate=sample_rate) self.data = data - self.sample_rate = sample_rate - def to_file(self, uri, bit_rate=None): - return wav_utils.save_wav(uri, self.data, self.sample_rate) + def get_all_samples(self): + return SimpleNamespace(data=self.data) diff --git a/test/torchcodec/encoders.py b/test/torchcodec/encoders.py index 5e9cc54968..cef6953824 100644 --- a/test/torchcodec/encoders.py +++ b/test/torchcodec/encoders.py @@ -1,10 +1,10 @@ import torchaudio_unittest.common_utils.wav_utils as wav_utils +from types import SimpleNamespace class AudioEncoder: def __init__(self, data, sample_rate): - print("BEING CALLED") self.data = data - self.sample_rate = sample_rate + self.metadata = SimpleNamespace(sample_rate=sample_rate) def to_file(self, uri, bit_rate=None): - return wav_utils.save_wav(uri, self.data, self.sample_rate) + return wav_utils.save_wav(uri, self.data, self.metadata.sample_rate) From 54071630c957e3eab5dc271f5e9bb5dd25e3d67c Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:01:18 +0000 Subject: [PATCH 24/35] Skip test_load_save_torchcodec --- .../test_load_save_torchcodec.py | 152 +++++++++--------- 1 file changed, 78 insertions(+), 74 deletions(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 90fcc15689..28d316952e 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -12,6 +12,10 @@ from torchaudio import load_with_torchcodec, save_with_torchcodec from torchaudio_unittest.common_utils import get_asset_path +# Now, load/save_torchcodec are the same as torchaudio.load/save, so +# there is no need to test this. 
+pytest.skip() + def get_ffmpeg_version(): """Get FFmpeg version to check for compatibility issues.""" try: @@ -48,25 +52,25 @@ def test_basic_load(filename): # Skip problematic files on FFmpeg4 due to known compatibility issues if is_ffmpeg4() and filename != "sinewave.wav": pytest.skip("FFmpeg4 has known compatibility issues with some audio files") - + file_path = get_asset_path(*filename.split("/")) - + # Load with torchaudio waveform_ta, sample_rate_ta = torchaudio.load(file_path) - + # Load with torchcodec waveform_tc, sample_rate_tc = load_with_torchcodec(file_path) - + # Check sample rates match assert sample_rate_ta == sample_rate_tc - + # Check shapes match assert waveform_ta.shape == waveform_tc.shape - + # Check data types (should both be float32) assert waveform_ta.dtype == torch.float32 assert waveform_tc.dtype == torch.float32 - + # Check values are close (allowing for small differences in decoders) torch.testing.assert_close(waveform_ta, waveform_tc) @@ -79,17 +83,17 @@ def test_basic_load(filename): def test_frame_offset_and_num_frames(frame_offset, num_frames): """Test frame_offset and num_frames parameters.""" file_path = get_asset_path("sinewave.wav") - + # Load with torchaudio waveform_ta, sample_rate_ta = torchaudio.load( file_path, frame_offset=frame_offset, num_frames=num_frames ) - + # Load with torchcodec waveform_tc, sample_rate_tc = load_with_torchcodec( file_path, frame_offset=frame_offset, num_frames=num_frames ) - + # Check results match assert sample_rate_ta == sample_rate_tc assert waveform_ta.shape == waveform_tc.shape @@ -98,21 +102,21 @@ def test_frame_offset_and_num_frames(frame_offset, num_frames): def test_channels_first(): """Test channels_first parameter.""" file_path = get_asset_path("sinewave.wav") # Use sinewave.wav for compatibility - + # Test channels_first=True (default) waveform_cf_true, sample_rate = load_with_torchcodec(file_path, channels_first=True) - + # Test channels_first=False waveform_cf_false, _ = 
load_with_torchcodec(file_path, channels_first=False) - + # Check that transpose relationship holds assert waveform_cf_true.shape == waveform_cf_false.transpose(0, 1).shape torch.testing.assert_close(waveform_cf_true, waveform_cf_false.transpose(0, 1)) - + # Compare with torchaudio waveform_ta_true, _ = torchaudio.load(file_path, channels_first=True) waveform_ta_false, _ = torchaudio.load(file_path, channels_first=False) - + assert waveform_cf_true.shape == waveform_ta_true.shape assert waveform_cf_false.shape == waveform_ta_false.shape torch.testing.assert_close(waveform_cf_true, waveform_ta_true) @@ -121,18 +125,18 @@ def test_channels_first(): def test_normalize_parameter_warning(): """Test that normalize=False produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="normalize=False.*ignored"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, normalize=False) - + # Result should still be float32 (normalized) assert waveform.dtype == torch.float32 def test_buffer_size_parameter_warning(): """Test that non-default buffer_size produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="buffer_size.*not used"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, buffer_size=8192) @@ -141,7 +145,7 @@ def test_buffer_size_parameter_warning(): def test_backend_parameter_warning(): """Test that specifying backend produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="backend.*not used"): # This should produce a warning waveform, sample_rate = load_with_torchcodec(file_path, backend="ffmpeg") @@ -156,10 +160,10 @@ def test_invalid_file(): def test_format_parameter(): """Test that format parameter produces a warning.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns(UserWarning, match="format.*not supported"): waveform, sample_rate 
= load_with_torchcodec(file_path, format="wav") - + # Check basic properties assert waveform.dtype == torch.float32 assert sample_rate > 0 @@ -168,17 +172,17 @@ def test_format_parameter(): def test_multiple_warnings(): """Test that multiple unsupported parameters produce multiple warnings.""" file_path = get_asset_path("sinewave.wav") - + with pytest.warns() as warning_list: # This should produce multiple warnings waveform, sample_rate = load_with_torchcodec( - file_path, - normalize=False, - buffer_size=8192, + file_path, + normalize=False, + buffer_size=8192, backend="ffmpeg" ) - - + + # Check that expected warnings are present messages = [str(w.message) for w in warning_list] assert any("normalize=False" in msg for msg in messages) @@ -194,30 +198,30 @@ def test_save_basic_save(filename): # Load a test file first file_path = get_asset_path(*filename.split("/")) waveform, sample_rate = torchaudio.load(file_path) - + with tempfile.TemporaryDirectory() as temp_dir: # Save with torchaudio ta_path = os.path.join(temp_dir, "ta_output.wav") torchaudio.save(ta_path, waveform, sample_rate) - + # Save with torchcodec tc_path = os.path.join(temp_dir, "tc_output.wav") save_with_torchcodec(tc_path, waveform, sample_rate) - + # Load both back and compare waveform_ta, sample_rate_ta = torchaudio.load(ta_path) waveform_tc, sample_rate_tc = torchaudio.load(tc_path) - + # Check sample rates match assert sample_rate_ta == sample_rate_tc - + # Check shapes match assert waveform_ta.shape == waveform_tc.shape - + # Check data types (should both be float32) assert waveform_ta.dtype == torch.float32 assert waveform_tc.dtype == torch.float32 - + # Check values are close (allowing for small differences in encoders) torch.testing.assert_close(waveform_ta, waveform_tc, atol=1e-3, rtol=1e-3) @@ -230,22 +234,22 @@ def test_save_channels_first(channels_first): waveform = torch.rand(2, 16000) # [channel, time] else: waveform = torch.rand(16000, 2) # [time, channel] - + sample_rate = 16000 - + 
with tempfile.TemporaryDirectory() as temp_dir: # Save with torchaudio ta_path = os.path.join(temp_dir, "ta_output.wav") torchaudio.save(ta_path, waveform, sample_rate, channels_first=channels_first) - + # Save with torchcodec tc_path = os.path.join(temp_dir, "tc_output.wav") save_with_torchcodec(tc_path, waveform, sample_rate, channels_first=channels_first) - + # Load both back and compare waveform_ta, sample_rate_ta = torchaudio.load(ta_path) waveform_tc, sample_rate_tc = torchaudio.load(tc_path) - + # Check results match assert sample_rate_ta == sample_rate_tc assert waveform_ta.shape == waveform_tc.shape @@ -256,15 +260,15 @@ def test_save_compression_parameter(): """Test compression parameter (maps to bit_rate).""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Test with compression (bit_rate) output_path = os.path.join(temp_dir, "output.wav") save_with_torchcodec(output_path, waveform, sample_rate, compression=128000) - + # Should not raise an error and file should exist assert os.path.exists(output_path) - + # Load back and check basic properties waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) assert sample_rate_loaded == sample_rate @@ -275,13 +279,13 @@ def test_save_format_parameter_warning(): """Test that format parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="format.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, format="wav") - + # Should still work despite warning assert os.path.exists(output_path) @@ -290,13 +294,13 @@ def test_save_encoding_parameter_warning(): """Test that encoding parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + 
with pytest.warns(UserWarning, match="encoding.*not fully supported"): save_with_torchcodec(output_path, waveform, sample_rate, encoding="PCM_16") - + # Should still work despite warning assert os.path.exists(output_path) @@ -305,13 +309,13 @@ def test_save_bits_per_sample_parameter_warning(): """Test that bits_per_sample parameter produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="bits_per_sample.*not directly supported"): save_with_torchcodec(output_path, waveform, sample_rate, bits_per_sample=16) - + # Should still work despite warning assert os.path.exists(output_path) @@ -320,13 +324,13 @@ def test_save_buffer_size_parameter_warning(): """Test that non-default buffer_size produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="buffer_size.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, buffer_size=8192) - + # Should still work despite warning assert os.path.exists(output_path) @@ -335,13 +339,13 @@ def test_save_backend_parameter_warning(): """Test that specifying backend produces a warning.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns(UserWarning, match="backend.*not used"): save_with_torchcodec(output_path, waveform, sample_rate, backend="ffmpeg") - + # Should still work despite warning assert os.path.exists(output_path) @@ -350,16 +354,16 @@ def test_save_edge_cases(): """Test edge cases and error conditions.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + # 
Test with very small waveform small_waveform = torch.randn(1, 10) save_with_torchcodec(output_path, small_waveform, sample_rate) waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) assert sample_rate_loaded == sample_rate - + # Test with different sample rates for sr in [8000, 22050, 44100]: sr_path = os.path.join(temp_dir, f"output_{sr}.wav") @@ -372,19 +376,19 @@ def test_save_invalid_inputs(): """Test that invalid inputs raise appropriate errors.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + # Test with invalid sample rate with pytest.raises(ValueError, match="sample_rate must be positive"): save_with_torchcodec(output_path, waveform, -1) - + # Test with invalid tensor dimensions with pytest.raises(ValueError, match="Expected 1D or 2D tensor"): invalid_waveform = torch.randn(1, 2, 16000) # 3D tensor save_with_torchcodec(output_path, invalid_waveform, sample_rate) - + # Test with non-tensor input with pytest.raises(ValueError, match="Expected src to be a torch.Tensor"): save_with_torchcodec(output_path, [1, 2, 3], sample_rate) @@ -394,14 +398,14 @@ def test_save_multiple_warnings(): """Test that multiple unsupported parameters produce multiple warnings.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: output_path = os.path.join(temp_dir, "output.wav") - + with pytest.warns() as warning_list: save_with_torchcodec( - output_path, - waveform, + output_path, + waveform, sample_rate, format="wav", encoding="PCM_16", @@ -409,7 +413,7 @@ def test_save_multiple_warnings(): buffer_size=8192, backend="ffmpeg" ) - + # Check that expected warnings are present messages = [str(w.message) for w in warning_list] assert any("format" in msg for msg in messages) @@ -417,7 +421,7 @@ def test_save_multiple_warnings(): assert any("bits_per_sample" in msg for msg in messages) assert 
any("buffer_size" in msg for msg in messages) assert any("backend" in msg for msg in messages) - + # Should still work despite warnings assert os.path.exists(output_path) @@ -426,17 +430,17 @@ def test_save_different_formats(): """Test saving to different audio formats.""" waveform = torch.randn(1, 16000) sample_rate = 16000 - + with tempfile.TemporaryDirectory() as temp_dir: # Test common formats formats = ["wav", "mp3", "flac"] - + for fmt in formats: output_path = os.path.join(temp_dir, f"output.{fmt}") try: save_with_torchcodec(output_path, waveform, sample_rate) assert os.path.exists(output_path) - + # Try to load back (may not work for all formats with all backends) try: waveform_loaded, sample_rate_loaded = torchaudio.load(output_path) @@ -446,4 +450,4 @@ def test_save_different_formats(): pass except Exception as e: # Some formats might not be supported by torchcodec - pytest.skip(f"Format {fmt} not supported: {e}") \ No newline at end of file + pytest.skip(f"Format {fmt} not supported: {e}") From bd7eb5239badb3a4858c5820ff606bf691dcaeff Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:33:48 +0000 Subject: [PATCH 25/35] Correct call to pytest skip --- test/torchaudio_unittest/test_load_save_torchcodec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/test_load_save_torchcodec.py b/test/torchaudio_unittest/test_load_save_torchcodec.py index 28d316952e..4a89123939 100644 --- a/test/torchaudio_unittest/test_load_save_torchcodec.py +++ b/test/torchaudio_unittest/test_load_save_torchcodec.py @@ -14,7 +14,7 @@ # Now, load/save_torchcodec are the same as torchaudio.load/save, so # there is no need to test this. 
-pytest.skip() +pytest.skip(allow_module_level=True) def get_ffmpeg_version(): """Get FFmpeg version to check for compatibility issues.""" From c3d0cc2bca81a9815e0592683347048562d33c16 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Thu, 14 Aug 2025 16:57:21 +0000 Subject: [PATCH 26/35] Remove torchcodec installation --- .github/scripts/unittest-linux/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/unittest-linux/install.sh b/.github/scripts/unittest-linux/install.sh index c8f47e63ab..68ed032bbb 100755 --- a/.github/scripts/unittest-linux/install.sh +++ b/.github/scripts/unittest-linux/install.sh @@ -74,7 +74,7 @@ case $GPU_ARCH_TYPE in ;; esac PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${GPU_ARCH_ID}" -pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" # 2. Install torchaudio From d10fc1925e38c5f1abec5753c5f11987e338e2e9 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 15 Aug 2025 15:57:04 +0000 Subject: [PATCH 27/35] Add torchcodec to build installation --- .github/workflows/build_docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml index e92c556218..f681e3b7ec 100644 --- a/.github/workflows/build_docs.yml +++ b/.github/workflows/build_docs.yml @@ -68,7 +68,7 @@ jobs: GPU_ARCH_ID=cu126 # This is hard-coded and must be consistent with gpu-arch-version. 
PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" - pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + pip install --progress-bar=off --pre torch torchcodec --index-url="${PYTORCH_WHEEL_INDEX}" echo "::endgroup::" echo "::group::Install TorchAudio" From 92fee5133bd585b43f96bcf3985a61806fee6f33 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Fri, 15 Aug 2025 16:48:41 +0000 Subject: [PATCH 28/35] Remove redundant wav_utils --- src/torchaudio/utils/wav_utils.py | 92 ------------------------------- 1 file changed, 92 deletions(-) delete mode 100644 src/torchaudio/utils/wav_utils.py diff --git a/src/torchaudio/utils/wav_utils.py b/src/torchaudio/utils/wav_utils.py deleted file mode 100644 index db15494dca..0000000000 --- a/src/torchaudio/utils/wav_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -from typing import Optional - -import scipy.io.wavfile -import torch - - -def normalize_wav(tensor: torch.Tensor) -> torch.Tensor: - if tensor.dtype == torch.float32: - pass - elif tensor.dtype == torch.int32: - tensor = tensor.to(torch.float32) - tensor[tensor > 0] /= 2147483647.0 - tensor[tensor < 0] /= 2147483648.0 - elif tensor.dtype == torch.int16: - tensor = tensor.to(torch.float32) - tensor[tensor > 0] /= 32767.0 - tensor[tensor < 0] /= 32768.0 - elif tensor.dtype == torch.uint8: - tensor = tensor.to(torch.float32) - 128 - tensor[tensor > 0] /= 127.0 - tensor[tensor < 0] /= 128.0 - return tensor - - -def get_wav_data( - dtype: str, - num_channels: int, - *, - num_frames: Optional[int] = None, - normalize: bool = True, - channels_first: bool = True, -): - """Generate linear signal of the given dtype and num_channels - - Data range is - [-1.0, 1.0] for float32, - [-2147483648, 2147483647] for int32 - [-32768, 32767] for int16 - [0, 255] for uint8 - - num_frames allow to change the linear interpolation parameter. - Default values are 256 for uint8, else 1 << 16. 
- 1 << 16 as default is so that int16 value range is completely covered. - """ - dtype_ = getattr(torch, dtype) - - if num_frames is None: - if dtype == "uint8": - num_frames = 256 - else: - num_frames = 1 << 16 - - if dtype == "uint8": - base = torch.linspace(0, 255, num_frames, dtype=dtype_) - elif dtype == "int8": - base = torch.linspace(-128, 127, num_frames, dtype=dtype_) - elif dtype == "float32": - base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) - elif dtype == "float64": - base = torch.linspace(-1.0, 1.0, num_frames, dtype=dtype_) - elif dtype == "int32": - base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_) - elif dtype == "int16": - base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_) - else: - raise NotImplementedError(f"Unsupported dtype {dtype}") - data = base.repeat([num_channels, 1]) - if not channels_first: - data = data.transpose(1, 0) - if normalize: - data = normalize_wav(data) - return data - - -def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor: - """Load wav file without torchaudio""" - sample_rate, data = scipy.io.wavfile.read(path) - data = torch.from_numpy(data.copy()) - if data.ndim == 1: - data = data.unsqueeze(1) - if normalize: - data = normalize_wav(data) - if channels_first: - data = data.transpose(1, 0) - return data, sample_rate - - -def save_wav(path, data, sample_rate, channels_first=True): - """Save wav file without torchaudio""" - if channels_first: - data = data.transpose(1, 0) - scipy.io.wavfile.write(path, sample_rate, data.numpy()) From 8ac07208bafb6b576fc6dbfd1dec37aaffed3502 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Mon, 18 Aug 2025 18:08:44 +0000 Subject: [PATCH 29/35] Remove io export --- src/torchaudio/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index 3d67af5945..c3545855ac 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -199,7 +199,6 @@ def 
save( "load_with_torchcodec", "save_with_torchcodec", "save", - "io", "compliance", "datasets", "functional", From 6b7d78c966f9d9eb51dda363661515776e3f1dc2 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Mon, 18 Aug 2025 18:32:37 +0000 Subject: [PATCH 30/35] Remove io import --- src/torchaudio/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/torchaudio/__init__.py b/src/torchaudio/__init__.py index c3545855ac..f57572e5c8 100644 --- a/src/torchaudio/__init__.py +++ b/src/torchaudio/__init__.py @@ -12,7 +12,6 @@ compliance, datasets, functional, - io, kaldi_io, models, pipelines, From f2c21e6c06b49d5612cbea06e4baeb69b74a26ed Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Mon, 18 Aug 2025 19:25:58 +0000 Subject: [PATCH 31/35] Remove torchaudio.io references in docs --- docs/source/_templates/autosummary/io.rst | 19 ------ .../_templates/autosummary/io_class.rst | 59 ------------------- docs/source/installation.rst | 5 +- docs/source/io.rst | 29 --------- docs/source/torio.io.rst | 2 - 5 files changed, 1 insertion(+), 113 deletions(-) delete mode 100644 docs/source/_templates/autosummary/io.rst delete mode 100644 docs/source/_templates/autosummary/io_class.rst delete mode 100644 docs/source/io.rst diff --git a/docs/source/_templates/autosummary/io.rst b/docs/source/_templates/autosummary/io.rst deleted file mode 100644 index 120348bacf..0000000000 --- a/docs/source/_templates/autosummary/io.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. - autogenerated from source/_templates/autosummary/io.rst - -{{ fullname | underline }} - -.. autofunction:: {{ fullname }} - - -{%- if name == "info" %} - -Support Structure ------------------ - -AudioMetaData -~~~~~~~~~~~~~ - -.. 
autoclass:: torchaudio.AudioMetaData - -{%- endif %} diff --git a/docs/source/_templates/autosummary/io_class.rst b/docs/source/_templates/autosummary/io_class.rst deleted file mode 100644 index 1b748d93ff..0000000000 --- a/docs/source/_templates/autosummary/io_class.rst +++ /dev/null @@ -1,59 +0,0 @@ -.. - autogenerated from source/_templates/autosummary/io_class.rst - -{#- - ################################################################################ - # autosummary template for torchaudio.io module - # Since StreamReader/StreamWriter have many methods/properties, - # we want to list them up in the table of contents. - # The default class template does not do this, so we use custom one here. - ################################################################################ -#} - -{{ name | underline }} - -.. autoclass:: {{ fullname }} - -{%- if name not in ["StreamReader", "StreamWriter"] %} - -{%- if attributes %} - -Properties ----------- - -{%- for item in attributes %} -{%- if not item.startswith('_') and item not in inherited_members %} - -{{ item | underline("~") }} - -.. container:: py attribute - - .. autoproperty:: {{[fullname, item] | join('.')}} - -{%- endif %} -{%- endfor %} -{%- endif %} - -{%- if members %} - -Methods -------- - -{%- for item in members %} -{%- if - not item.startswith('_') - and item not in inherited_members - and item not in attributes - %} - -{{ item | underline("~") }} - -.. container:: py attribute - - .. automethod:: {{[fullname, item] | join('.')}} - -{%- endif %} -{%- endfor %} -{%- endif %} - -{%- endif %} diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 7fc036c592..cb0fa190b8 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -34,9 +34,6 @@ Optional Dependencies * `FFmpeg `__ - Required to use :py:mod:`torchaudio.io` module. and ``backend="ffmpeg"`` in - `I/O functions <./torchaudio.html#i-o>`__. 
- Starting version 2.1, TorchAudio official binary distributions are compatible with FFmpeg version 6, 5 and 4. (>=4.4, <7). At runtime, TorchAudio first looks for FFmpeg 6, if not found, then it continues to looks for 5 and move on to 4. @@ -111,7 +108,7 @@ Optional Dependencies Required to use :py:mod:`torchaudio.kaldi_io` module. - + Compatibility Matrix -------------------- diff --git a/docs/source/io.rst b/docs/source/io.rst deleted file mode 100644 index 11e3c0c32c..0000000000 --- a/docs/source/io.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. py:module:: torchaudio.io - -torchaudio.io -============= - -.. warning:: - Starting with version 2.8, we are refactoring TorchAudio to transition it - into a maintenance phase. As a result: - - - The ``torchaudio.io`` module is deprecated in 2.8 and will be removed in 2.9. - - The decoding and encoding capabilities of PyTorch for both audio and video - are being consolidated into TorchCodec. - - Please see https://github.com/pytorch/audio/issues/3902 for more information. - -.. currentmodule:: torchaudio.io - -.. autosummary:: - :toctree: generated - :nosignatures: - :template: autosummary/io_class.rst - - StreamReader - StreamWriter - play_audio - -.. rubric:: Tutorials using ``torchaudio.io`` - -.. minigallery:: torchaudio.io diff --git a/docs/source/torio.io.rst b/docs/source/torio.io.rst index eb41c71259..a20b23f95f 100644 --- a/docs/source/torio.io.rst +++ b/docs/source/torio.io.rst @@ -26,5 +26,3 @@ torio.io .. rubric:: Tutorials using ``torio.io`` .. minigallery:: torio.io - -.. 
minigallery:: torchaudio.io From 7ef8c69e8db93778a5ae2304e39191eb85c18645 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 19 Aug 2025 17:33:36 +0000 Subject: [PATCH 32/35] Remove some torio references --- cmake/TorchAudioHelper.cmake | 21 ------------------- .../common_utils/case_utils.py | 3 --- 2 files changed, 24 deletions(-) diff --git a/cmake/TorchAudioHelper.cmake b/cmake/TorchAudioHelper.cmake index d000483e37..3553da8301 100644 --- a/cmake/TorchAudioHelper.cmake +++ b/cmake/TorchAudioHelper.cmake @@ -41,17 +41,6 @@ function(torchaudio_library name source include_dirs link_libraries compile_defs ) endfunction() -function(torio_library name source include_dirs link_libraries compile_defs) - _library( - torio/lib - "${name}" - "${source}" - "${include_dirs}" - "${link_libraries}" - "${compile_defs}" - ) -endfunction() - if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) # See https://github.com/pytorch/pytorch/issues/38122 find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib") @@ -103,16 +92,6 @@ if (BUILD_TORCHAUDIO_PYTHON_EXTENSION) "${definitions}" ) endfunction() - function(torio_extension name sources include_dirs libraries definitions) - _extension( - torio/lib - "${name}" - "${sources}" - "${include_dirs}" - "${libraries}" - "${definitions}" - ) - endfunction() endif() diff --git a/test/torchaudio_unittest/common_utils/case_utils.py b/test/torchaudio_unittest/common_utils/case_utils.py index b99b96f5b0..64bbfed64e 100644 --- a/test/torchaudio_unittest/common_utils/case_utils.py +++ b/test/torchaudio_unittest/common_utils/case_utils.py @@ -10,7 +10,6 @@ import torch import torchaudio -import torio from torch.testing._internal.common_utils import TestCase as PytorchTestCase from torchaudio._internal.module_utils import eval_env, is_module_available from torchaudio.utils.ffmpeg_utils import get_video_decoders, get_video_encoders @@ -108,8 +107,6 @@ class TorchaudioTestCase(TestBaseMixin, PytorchTestCase): pass 
-_IS_FFMPEG_AVAILABLE = torio._extension.lazy_import_ffmpeg_ext().is_available() -_IS_SOX_AVAILABLE = torchaudio._extension.lazy_import_sox_ext().is_available() _IS_CTC_DECODER_AVAILABLE = None _IS_CUDA_CTC_DECODER_AVAILABLE = None From a9123a9485101111ac0d49e8597ddaffc85db9d2 Mon Sep 17 00:00:00 2001 From: Sam Anklesaria Date: Tue, 19 Aug 2025 17:39:47 +0000 Subject: [PATCH 33/35] Remove docs references to libtorio --- docs/Makefile | 1 - docs/source/Doxyfile | 2727 ------------------------ docs/source/index.rst | 17 - docs/source/libtorio.stream_writer.rst | 86 - src/torchaudio/utils/__init__.py | 2 - src/torchaudio/utils/ffmpeg_utils.py | 11 - 6 files changed, 2844 deletions(-) delete mode 100644 docs/source/Doxyfile delete mode 100644 docs/source/libtorio.stream_writer.rst delete mode 100644 src/torchaudio/utils/ffmpeg_utils.py diff --git a/docs/Makefile b/docs/Makefile index fd3f719262..02fc7eda9a 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -24,7 +24,6 @@ docset: html # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile -# doxygen source/Doxyfile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @python post_process_dispatcher.py $(BUILDDIR) diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile deleted file mode 100644 index 73a2ab8f0d..0000000000 --- a/docs/source/Doxyfile +++ /dev/null @@ -1,2727 +0,0 @@ -# Doxyfile 1.9.5 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). 
-# -# Note: -# -# Use doxygen to compare the used configuration file with the template -# configuration file: -# doxygen -x [configFile] -# Use doxygen to compare the used configuration file with the template -# configuration file without replacing the environment variables or CMake type -# replacement variables: -# doxygen -x_noenv [configFile] - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the configuration -# file that follow. The default is UTF-8 which is also the encoding used for all -# text before the first occurrence of this tag. Doxygen uses libiconv (or the -# iconv built into libc) for the transcoding. See -# https://www.gnu.org/software/libiconv/ for the list of possible encodings. -# The default value is: UTF-8. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "libtorio" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify a logo or an icon that is included -# in the documentation. 
The maximum height of the logo should not exceed 55 -# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy -# the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = source/cpp - -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 -# sub-directories (in 2 levels) under the output directory of each output format -# and will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to -# control the number of sub-directories. -# The default value is: NO. - -CREATE_SUBDIRS = NO - -# Controls the number of sub-directories that will be created when -# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every -# level increment doubles the number of directories, resulting in 4096 -# directories at level 8 which is the default and also the maximum value. The -# sub-directories are organized in 2 levels, the first level always has a fixed -# numer of 16 directories. -# Minimum value: 0, maximum value: 8, default value: 8. -# This tag requires that the tag CREATE_SUBDIRS is set to YES. - -CREATE_SUBDIRS_LEVEL = 8 - -# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII -# characters to appear in the names of generated files. If set to NO, non-ASCII -# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode -# U+3044. -# The default value is: NO. 
- -ALLOW_UNICODE_NAMES = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, -# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English -# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, -# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with -# English messages), Korean, Korean-en (Korean with English messages), Latvian, -# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, -# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, -# Swedish, Turkish, Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. 
If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
- -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line -# such as -# /*************** -# as being the beginning of a Javadoc-style comment "banner". If set to NO, the -# Javadoc-style will behave just like regular comments and it will not be -# interpreted by doxygen. -# The default value is: NO. - -JAVADOC_BANNER = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! 
or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. - -MULTILINE_CPP_IS_BRIEF = NO - -# By default Python docstrings are displayed as preformatted text and doxygen's -# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the -# doxygen's special commands can be used and the contents of the docstring -# documentation blocks is shown as doxygen documentation. -# The default value is: YES. - -PYTHON_DOCSTRING = YES - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new -# page for each member. If set to NO, the documentation of a member will be part -# of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 4 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:^^" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". Note that you cannot put \n's in the value part of an alias -# to insert newlines (in the resulting output). 
You can put ^^ in the value part -# of an alias to insert a newline as if a physical newline was in the original -# file. When you need a literal { or } or , in the value part of an alias you -# have to escape them by means of a backslash (\), this can lead to conflicts -# with the commands \{ and \} for these it is advised to use the version @{ and -# @} or use a double escape (\\{ and \\}) - -ALIASES = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice -# sources only. Doxygen will then generate output that is more tailored for that -# language. For instance, namespaces will be presented as modules, types will be -# separated into more groups, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_SLICE = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. 
With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, -# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, -# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: -# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser -# tries to guess whether the code is fixed or free formatted code, this is the -# default for Fortran type files). For instance to make doxygen treat .inc files -# as Fortran files (default is PHP), and .f files as C (default is Fortran), -# use: inc=Fortran f=C. -# -# Note: For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. When specifying no_extension you should add -# * to the FILE_PATTERNS. -# -# Note see also the list of default file extension mappings. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See https://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. - -MARKDOWN_SUPPORT = YES - -# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up -# to that level are automatically included in the table of contents, even if -# they do not have an id attribute. -# Note: This feature currently applies only to Markdown headings. -# Minimum value: 0, maximum value: 99, default value: 5. 
-# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. - -TOC_INCLUDE_HEADINGS = 5 - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. 
- -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# If one adds a struct or class to a group and this option is enabled, then also -# any nested class or struct is added to the same group. By default this option -# is disabled and one has to add nested compounds explicitly via \ingroup. -# The default value is: NO. - -GROUP_NESTED_COMPOUNDS = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. 
- -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use -# during processing. When set to 0 doxygen will based this on the number of -# cores available in the system. You can set it explicitly to a value larger -# than 0 to get more control over the balance between CPU load and processing -# speed. At this moment only the input processing can be done using multiple -# threads. Since this is still an experimental feature the default is set to 1, -# which effectively disables parallel processing. Please report any issues you -# encounter. 
Generating dot graphs in parallel is controlled by the -# DOT_NUM_THREADS setting. -# Minimum value: 0, maximum value: 32, default value: 1. - -NUM_PROC_THREADS = 1 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will -# be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIVATE = NO - -# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual -# methods of a class will be included in the documentation. -# The default value is: NO. - -EXTRACT_PRIV_VIRTUAL = NO - -# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = NO - -# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = NO - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO, -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. 
If set to YES, local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO, only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If this flag is set to YES, the name of an unnamed parameter in a declaration -# will be determined by the corresponding definition. By default unnamed -# parameters remain unnamed in the output. -# The default value is: YES. - -RESOLVE_UNNAMED_PARAMS = YES - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# declarations. If set to NO, these declarations will be included in the -# documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. 
If set to NO, these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# With the correct setting of option CASE_SENSE_NAMES doxygen will better be -# able to match the capabilities of the underlying filesystem. In case the -# filesystem is case sensitive (i.e. it supports files in the same directory -# whose names only differ in casing), the option must be set to YES to properly -# deal with such files in case they appear in the input. For filesystems that -# are not case sensitive the option should be set to NO to properly deal with -# output files written for symbols that only differ in casing, such as for two -# classes, one named CLASS and the other named Class, and to also support -# references to files without having to specify the exact matching casing. On -# Windows (including Cygwin) and MacOS, users should typically set this option -# to NO, whereas on Linux or other Unix flavors it should typically be set to -# YES. -# Possible values are: SYSTEM, NO and YES. -# The default value is: SYSTEM. - -CASE_SENSE_NAMES = SYSTEM - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES, the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will -# append additional text to a page's title, such as Class Reference. If set to -# YES the compound reference will be hidden. -# The default value is: NO. 
- -HIDE_COMPOUND_REFERENCE= NO - -# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class -# will show which file needs to be included to use the class. -# The default value is: YES. - -SHOW_HEADERFILE = YES - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. - -SHOW_INCLUDE_FILES = YES - -# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each -# grouped member an include statement to the documentation, telling the reader -# which file to include in order to use the member. -# The default value is: NO. - -SHOW_GROUPED_MEMB_INC = NO - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO, the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. 
If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo -# list. This list is created by putting \todo commands in the documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test -# list. 
This list is created by putting \test commands in the documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if <section_label> ... \endif and \cond <section_label> -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES, the -# list will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page.
This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. See also section "Changing the -# layout of pages" for information. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. See also \cite for info how to create references. 
- -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = YES - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as documenting some parameters in -# a documented function twice, or documenting parameters that don't exist or -# using markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete -# function parameter documentation. If set to NO, doxygen will accept that some -# parameters have no documentation without warning. -# The default value is: YES. - -WARN_IF_INCOMPLETE_DOC = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong parameter -# documentation, but not about the absence of documentation. 
If EXTRACT_ALL is -# set to YES then this flag will automatically be disabled. See also -# WARN_IF_INCOMPLETE_DOC -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS -# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but -# at the end of the doxygen process doxygen will return with a non-zero status. -# Possible values are: NO, YES and FAIL_ON_WARNINGS. -# The default value is: NO. - -WARN_AS_ERROR = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# See also: WARN_LINE_FORMAT -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# In the $text part of the WARN_FORMAT command it is possible that a reference -# to a more specific place is given. To make it easier to jump to this place -# (outside of doxygen) the user can define a custom "cut" / "paste" string. -# Example: -# WARN_LINE_FORMAT = "'vi $file +$line'" -# See also: WARN_FORMAT -# The default value is: at line $line of file $file. - -WARN_LINE_FORMAT = "at line $line of file $file" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. If left blank the output is written to standard -# error (stderr). In case the file specified cannot be opened for writing the -# warning and error messages are written to standard error. When as file - is -# specified the warning and error messages are written to standard output -# (stdout). 
- -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING -# Note: If this tag is empty the current directory is searched. - -INPUT = ../src/libtorio/ffmpeg/stream_reader/typedefs.h \ - ../src/libtorio/ffmpeg/stream_reader/stream_reader.h \ - ../src/libtorio/ffmpeg/stream_writer/stream_writer.h - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: -# https://www.gnu.org/software/libiconv/) for the list of possible encodings. -# See also: INPUT_FILE_ENCODING -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify -# character encoding on a per file pattern basis. Doxygen will compare the file -# name with each pattern and apply the encoding instead of the default -# INPUT_ENCODING) if there is a match. The character encodings are a list of the -# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding -# "INPUT_ENCODING" for further information on supported encodings. - -INPUT_FILE_ENCODING = - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. 
-# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# read by doxygen. -# -# Note the list of default checked file patterns might differ from the list of -# default file extension mappings. -# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, -# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C -# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. - -FILE_PATTERNS = *.c \ - *.cc \ - *.cxx \ - *.cpp \ - *.c++ \ - *.java \ - *.ii \ - *.ixx \ - *.ipp \ - *.i++ \ - *.inl \ - *.idl \ - *.ddl \ - *.odl \ - *.h \ - *.hh \ - *.hxx \ - *.hpp \ - *.h++ \ - *.l \ - *.cs \ - *.d \ - *.php \ - *.php4 \ - *.php5 \ - *.phtml \ - *.inc \ - *.m \ - *.markdown \ - *.md \ - *.mm \ - *.dox \ - *.py \ - *.pyw \ - *.f90 \ - *.f95 \ - *.f03 \ - *.f08 \ - *.f18 \ - *.f \ - *.for \ - *.vhd \ - *.vhdl \ - *.ucf \ - *.qsf \ - *.ice - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = NO - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. 
- -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# ANamespace::AClass, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. - -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. 
Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# <filter> <input-file> -# -# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. -# -# Note that doxygen will use the data processed and written to standard output -# for further processing, therefore nothing else, like debug statements or used -# commands (so in case of a Windows batch file always use @echo OFF), should be -# written to standard output. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. -# -# Note that for custom extensions or not directly supported extensions you also -# need to set EXTENSION_MAPPING for the extension otherwise the files are not -# properly processed by doxygen. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO.
- -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -# The Fortran standard specifies that for fixed formatted Fortran code all -# characters from position 72 are to be considered as comment. A common -# extension is to allow longer lines before the automatic comment starts. The -# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can -# be processed before the automatic comment starts. -# Minimum value: 7, maximum value: 10000, default value: 72. - -FORTRAN_COMMENT_AFTER = 72 - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = NO - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. 
- -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# entity all documented functions referencing it will be listed. -# The default value is: NO. - -REFERENCED_BY_RELATION = NO - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see https://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. 
-# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output. -# The default value is: YES. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file must include any scripts and style sheets -# that doxygen needs, which depend on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. -# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined -# cascading style sheets that are included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefore more robust against future updates. -# Doxygen will copy the style sheet files to the output directory. -# Note: The order of the extra style sheet files is of importance (e.g. the last -# style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output -# should be rendered with a dark or light theme. Default setting AUTO_LIGHT -# enables light output unless the user preference is dark output. Other options -# are DARK to always use dark mode, LIGHT to always use light mode, AUTO_DARK to -# default to dark mode unless the user prefers light mode, and TOGGLE to let the -# user toggle between dark and light mode via a button. -# Possible values are: LIGHT (always generate light output), DARK (always generate -# dark output), AUTO_LIGHT (automatically set the mode according to the user -# preference, using light mode if no preference is set; this is the default), AUTO_DARK -# (automatically set the mode according to the user preference, using dark mode if -# no preference is set) and TOGGLE (allow the user to switch between light and -# dark mode via a button). -# The default value is: AUTO_LIGHT. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE = AUTO_LIGHT - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a color-wheel, see -# https://en.wikipedia.org/wiki/Hue for more information. 
For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use gray-scales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. -# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML -# documentation will contain a main index with vertical navigation menus that -# are dynamically created via JavaScript. If disabled, the navigation index will -# consist of multiple levels of tabs that are statically embedded in every HTML -# page. 
Disable this option to support browsers that do not have JavaScript, -# like the Qt help browser. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_MENUS = YES - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries to 1 will produce a fully collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a fully expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: -# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To -# create a documentation set, doxygen will generate a Makefile in the HTML -# output directory. Running make will produce the docset in that directory and -# running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy -# genXcode/_index.html for more information. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag determines the URL of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDURL = - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# on Windows. 
In the beginning of 2021 Microsoft took the original page, with, -# among other things, the download links, offline (the HTML help workshop was already many years -# in maintenance mode). You can download the HTML help workshop from the web -# archives at Installation executable (see: -# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo -# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler (hhc.exe). If non-empty, -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the main .chm file (NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
- -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated -# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it -# enables the Previous and Next buttons. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: -# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. 
For more information please see Qt Help Project / Virtual -# Folders (see: -# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: -# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: -# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location (absolute path -# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to -# run qhelpgenerator on the generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. 
The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine tune the look of the index (see "Fine-tuning the output"). 
As an -# example, the default style sheet generated by doxygen has an example that -# shows how to put an image at the root of the tree instead of the PROJECT_NAME. -# Since the tree basically has the same information as the tab index, you could -# consider setting DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the -# FULL_SIDEBAR option determines if the side bar is limited to only the treeview -# area (value NO) or if it should extend to the full height of the window (value -# YES). Setting this to YES gives a layout similar to -# https://docs.readthedocs.io with more room for contents, but less room for the -# project logo, title, and description. If either GENERATE_TREEVIEW or -# DISABLE_INDEX is set to NO, this option has no effect. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FULL_SIDEBAR = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email -# addresses. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -OBFUSCATE_EMAILS = YES - -# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg -# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see -# https://inkscape.org) to generate formulas as SVG images instead of PNGs for -# the HTML output. These images will generally look nicer at scaled resolutions. -# Possible values are: png (the default) and svg (looks nicer but requires the -# pdf2svg or inkscape tool). -# The default value is: png. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FORMULA_FORMAT = png - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands -# to create new LaTeX commands to be used in formulas as building blocks. See -# the section "Including formulas" for details. - -FORMULA_MACROFILE = - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# https://www.mathjax.org) which uses client side JavaScript for the rendering -# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX -# installed or if you want formulas to look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. -# Note that the different versions of MathJax have different requirements with -# regards to the different settings, so it is possible that also other MathJax -# settings have to be changed when switching between the different MathJax -# versions. -# Possible values are: MathJax_2 and MathJax_3. -# The default value is: MathJax_2. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_VERSION = MathJax_2 - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. For more details about the output format see MathJax -# version 2 (see: -# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 -# (see: -# http://docs.mathjax.org/en/latest/web/components/output.html). -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility. This is the name for MathJax version 2, for MathJax version 3 -# this will be translated into chtml), NativeMML (i.e. MathML. Only supported -# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This -# is the name for MathJax version 3, for MathJax version 2 this will be -# translated into HTML-CSS) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax. 
However, it is strongly recommended to install a local copy of -# MathJax from https://www.mathjax.org before deployment. The default value is: -# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 -# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering. For example -# for MathJax version 2 (see -# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# For example for MathJax version 3 (see -# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): -# MATHJAX_EXTENSIONS = ams -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: -# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to -# search using the keyboard; to jump to the search box use + S -# (what the is depends on the OS and browser, but it is typically -# , /