wip

mthrok · mthrok · commit 6c018c85969d · 2023-03-01T23:08:54.000-05:00
diff --git a/torchaudio/csrc/ffmpeg/chunk.h b/torchaudio/csrc/ffmpeg/chunk.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include<torch/types.h>
+
+namespace torchaudio::io {
+
+/// Stores decoded frames and metadata
+struct Chunk {
+  /// Audio/video frames.
+  ///
+  /// For audio, the shape is ``[time, num_channels]``, and the ``dtype``
+  /// depends on output stream configurations.
+  ///
+  /// For video, the shape is ``[time, channel, height, width]``, and
+  /// the ``dtype`` is ``torch.uint8``.
+  torch::Tensor frames;
+  /// Presentation time stamp of the first frame,
+  /// in seconds.
+  double pts;
+};
+
+} // namespace torchaudio::io
diff --git a/torchaudio/csrc/ffmpeg/pybind/pybind.cpp b/torchaudio/csrc/ffmpeg/pybind/pybind.cpp
@@ -1,4 +1,5 @@
 #include <torch/extension.h>
+#include <torchaudio/csrc/ffmpeg/chunk.h>
 #include <torchaudio/csrc/ffmpeg/pybind/fileobj.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h>
@@ -40,8 +41,18 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def("add_video_stream", &StreamWriter::add_video_stream)
       .def("dump_format", &StreamWriter::dump_format)
       .def("open", &StreamWriter::open)
-      .def("write_audio_chunk", &StreamWriter::write_audio_chunk)
-      .def("write_video_chunk", &StreamWriter::write_video_chunk)
+      .def("write_audio_chunk",
+           py::overload_cast<int, const torch::Tensor&>(
+               &StreamWriter::write_audio_chunk))
+      .def("write_audio_chunk",
+           py::overload_cast<int, const Chunk&>(
+               &StreamWriter::write_audio_chunk))
+      .def("write_video_chunk",
+           py::overload_cast<int, const torch::Tensor&>(
+               &StreamWriter::write_video_chunk))
+      .def("write_video_chunk",
+           py::overload_cast<int, const Chunk&>(
+               &StreamWriter::write_video_chunk))
       .def("flush", &StreamWriter::flush)
       .def("close", &StreamWriter::close);
   py::class_<StreamWriterFileObj>(m, "StreamWriterFileObj", py::module_local())
@@ -51,8 +62,18 @@ PYBIND11_MODULE(_torchaudio_ffmpeg, m) {
       .def("add_video_stream", &StreamWriterFileObj::add_video_stream)
       .def("dump_format", &StreamWriterFileObj::dump_format)
       .def("open", &StreamWriterFileObj::open)
-      .def("write_audio_chunk", &StreamWriterFileObj::write_audio_chunk)
-      .def("write_video_chunk", &StreamWriterFileObj::write_video_chunk)
+      .def("write_audio_chunk",
+           py::overload_cast<int, const torch::Tensor&>(
+               &StreamWriterFileObj::write_audio_chunk))
+      .def("write_audio_chunk",
+           py::overload_cast<int, const Chunk&>(
+               &StreamWriterFileObj::write_audio_chunk))
+      .def("write_video_chunk",
+           py::overload_cast<int, const torch::Tensor&>(
+               &StreamWriterFileObj::write_video_chunk))
+      .def("write_video_chunk",
+           py::overload_cast<int, const Chunk&>(
+               &StreamWriterFileObj::write_video_chunk))
       .def("flush", &StreamWriterFileObj::flush)
       .def("close", &StreamWriterFileObj::close);
   py::class_<OutputStreamInfo>(m, "OutputStreamInfo", py::module_local())
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/buffer.h b/torchaudio/csrc/ffmpeg/stream_reader/buffer.h
@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/torch.h>
+#include <torchaudio/csrc/ffmpeg/chunk.h>
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/typedefs.h>
 
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h b/torchaudio/csrc/ffmpeg/stream_reader/stream_reader.h
@@ -1,4 +1,5 @@
 #pragma once
+#include <torchaudio/csrc/ffmpeg/chunk.h>
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/decoder.h>
 #include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h>
diff --git a/torchaudio/csrc/ffmpeg/stream_reader/typedefs.h b/torchaudio/csrc/ffmpeg/stream_reader/typedefs.h
@@ -106,20 +106,5 @@ struct OutputStreamInfo {
   std::string filter_description;
 };
 
-/// Stores decoded frames and metadata
-struct Chunk {
-  /// Audio/video frames.
-  ///
-  /// For audio, the shape is ``[time, num_channels]``, and the ``dtype``
-  /// depends on output stream configurations.
-  ///
-  /// For video, the shape is ``[time, channel, height, width]``, and
-  /// the ``dtype`` is ``torch.uint8``.
-  torch::Tensor frames;
-  ///
-  /// Presentation time stamp of the first frame, in second.
-  double pts;
-};
-
 } // namespace io
 } // namespace torchaudio
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/audio_output_stream.cpp b/torchaudio/csrc/ffmpeg/stream_writer/audio_output_stream.cpp
@@ -42,6 +42,10 @@ AudioOutputStream::AudioOutputStream(
       converter(src_fmt, codec_ctx_),
       codec_ctx(std::move(codec_ctx_)) {}
 
+void AudioOutputStream::write_chunk(const Chunk& chunk) {
+  write_chunk(chunk.frames); // delegates to the Tensor overload; chunk.pts is not used
+}
+
 void AudioOutputStream::write_chunk(const torch::Tensor& waveform) {
   AVRational time_base{1, codec_ctx->sample_rate};
   for (const auto& frame : converter.convert(waveform)) {
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/audio_output_stream.h b/torchaudio/csrc/ffmpeg/stream_writer/audio_output_stream.h
@@ -15,6 +15,7 @@ struct AudioOutputStream : OutputStream {
       AVCodecContextPtr&& codec_ctx);
 
   void write_chunk(const torch::Tensor& waveform) override;
+  void write_chunk(const Chunk& chunk) override;
   ~AudioOutputStream() override = default;
 };
 
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/output_stream.h b/torchaudio/csrc/ffmpeg/stream_writer/output_stream.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <torch/types.h>
+#include <torchaudio/csrc/ffmpeg/chunk.h>
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
 #include <torchaudio/csrc/ffmpeg/filter_graph.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/encoder.h>
@@ -23,6 +24,7 @@ struct OutputStream {
       FilterGraph&& filter);
 
   virtual void write_chunk(const torch::Tensor& input) = 0;
+  virtual void write_chunk(const Chunk& chunk) = 0;
   void process_frame(AVFrame* src);
   void flush();
   virtual ~OutputStream() = default;
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.cpp
@@ -587,11 +587,19 @@ void StreamWriter::write_audio_chunk(int i, const torch::Tensor& waveform) {
   streams[i]->write_chunk(waveform);
 }
 
+void StreamWriter::write_audio_chunk(int i, const Chunk& chunk) {
+  write_audio_chunk(i, chunk.frames); // delegates to the Tensor overload; chunk.pts is not used
+}
+
 void StreamWriter::write_video_chunk(int i, const torch::Tensor& frames) {
   validate_stream(i, AVMEDIA_TYPE_VIDEO);
   streams[i]->write_chunk(frames);
 }
 
+void StreamWriter::write_video_chunk(int i, const Chunk& chunk) {
+  write_video_chunk(i, chunk.frames); // delegates to the Tensor overload; chunk.pts is not used
+}
+
 void StreamWriter::flush() {
   for (auto& os : streams) {
     os->flush();
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h b/torchaudio/csrc/ffmpeg/stream_writer/stream_writer.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <torch/torch.h>
+#include <torchaudio/csrc/ffmpeg/chunk.h>
 #include <torchaudio/csrc/ffmpeg/ffmpeg.h>
 #include <torchaudio/csrc/ffmpeg/filter_graph.h>
 #include <torchaudio/csrc/ffmpeg/stream_writer/output_stream.h>
@@ -161,14 +162,16 @@ class StreamWriter {
   /// @param i Stream index.
   /// @param chunk Waveform tensor. Shape: ``(frame, channel)``.
   /// The ``dtype`` must match what was passed to ``add_audio_stream()`` method.
-  void write_audio_chunk(int i, const torch::Tensor& chunk);
+  void write_audio_chunk(int i, const torch::Tensor& frames);
+  void write_audio_chunk(int i, const Chunk& chunk);
   /// Write video data
   /// @param i Stream index.
   /// @param chunk Video/image tensor. Shape: ``(time, channel, height,
   /// width)``. The ``dtype`` must be ``torch.uint8``. The shape ``(height,
   /// width and the number of channels)`` must match what was configured when
   /// calling ``add_video_stream()``.
-  void write_video_chunk(int i, const torch::Tensor& chunk);
+  void write_video_chunk(int i, const torch::Tensor& frames);
+  void write_video_chunk(int i, const Chunk& chunk);
   /// Flush the frames from encoders and write the frames to the destination.
   void flush();
 
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.cpp b/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.cpp
@@ -50,6 +50,10 @@ VideoOutputStream::VideoOutputStream(
       hw_frame_ctx(std::move(hw_frame_ctx_)),
       codec_ctx(std::move(codec_ctx_)) {}
 
+void VideoOutputStream::write_chunk(const Chunk& chunk) {
+  write_chunk(chunk.frames); // delegates to the Tensor overload; chunk.pts is not used
+}
+
 void VideoOutputStream::write_chunk(const torch::Tensor& frames) {
   for (const auto& frame : converter.convert(frames)) {
     process_frame(frame);
diff --git a/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.h b/torchaudio/csrc/ffmpeg/stream_writer/video_output_stream.h
@@ -18,6 +18,7 @@ struct VideoOutputStream : OutputStream {
       AVBufferRefPtr&& hw_frame_ctx);
 
   void write_chunk(const torch::Tensor& frames) override;
+  void write_chunk(const Chunk& chunk) override;
 
   ~VideoOutputStream() override = default;
 };

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`#pragma once`
	`2`	`+#include <torchaudio/csrc/ffmpeg/chunk.h>`
`2`	`3`	`#include <torchaudio/csrc/ffmpeg/ffmpeg.h>`
`3`	`4`	`#include <torchaudio/csrc/ffmpeg/stream_reader/decoder.h>`
`4`	`5`	`#include <torchaudio/csrc/ffmpeg/stream_reader/stream_processor.h>`