Skip to content

[docs] descriptive comments of the decoder C++ api #3754

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Aug 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions torchvision/csrc/io/decoder/audio_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ int AudioStream::initFormat() {
: -1;
}

// copies audio sample bytes via swr_convert call in audio_sampler.cpp
int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
if (!sampler_) {
sampler_ = std::make_unique<AudioSampler>(codecCtx_);
Expand Down Expand Up @@ -95,6 +96,8 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
<< ", channels: " << format_.format.audio.channels
<< ", format: " << format_.format.audio.format;
}
// calls to a sampler that converts the audio samples and copies them to the
// out buffer via ffmpeg::swr_convert
return sampler_->sample(flush ? nullptr : frame_, out);
}

Expand Down
18 changes: 17 additions & 1 deletion torchvision/csrc/io/decoder/decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,12 @@ Decoder::~Decoder() {
cleanUp();
}

// Initialise the format context that holds information about the container and
// fill it with minimal information about the format (codecs are not opened
// here). Function reads in information about the streams from the container
// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
// specified within the decoder parameters, it seeks into the correct frame
// (note, the seek defined here is "precise" seek).
bool Decoder::init(
const DecoderParameters& params,
DecoderInCallback&& in,
Expand Down Expand Up @@ -384,7 +390,7 @@ bool Decoder::init(
cleanUp();
return false;
}

// SyncDecoder inherits from Decoder and may override onInit.
onInit();

if (params.startOffset != 0) {
Expand All @@ -399,6 +405,8 @@ bool Decoder::init(
return true;
}

// Opens the appropriate CODEC for every stream, stores it in the class
// variable `streams_`, and checks that the stream is in range for decoding.
bool Decoder::openStreams(std::vector<DecoderMetadata>* metadata) {
for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
// - find the corespondent format at params_.formats set
Expand Down Expand Up @@ -485,6 +493,10 @@ void Decoder::cleanUp() {
seekableBuffer_.shutdown();
}

// function does actual work, derived class calls it in working thread
// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if
// no frames got decoded in the specified timeout time, and error on
// unrecoverable error.
int Decoder::getFrame(size_t workingTimeInMs) {
if (inRange_.none()) {
return ENODATA;
Expand Down Expand Up @@ -601,11 +613,13 @@ int Decoder::getFrame(size_t workingTimeInMs) {
return 0;
}

// Look up an opened stream by its index; returns nullptr when no stream with
// that index exists in `streams_`.
Stream* Decoder::findByIndex(int streamIndex) const {
  const auto found = streams_.find(streamIndex);
  if (found == streams_.end()) {
    return nullptr;
  }
  return found->second.get();
}

// find stream by type; note finds only the first stream of a given type
Stream* Decoder::findByType(const MediaFormat& format) const {
for (auto& stream : streams_) {
if (stream.second->getMediaFormat().type == format.type) {
Expand All @@ -615,6 +629,8 @@ Stream* Decoder::findByType(const MediaFormat& format) const {
return nullptr;
}

// given the stream and packet, decode the frame buffers into the
// DecoderOutputMessage data structure via stream::decodePacket function.
int Decoder::processPacket(
Stream* stream,
AVPacket* packet,
Expand Down
14 changes: 14 additions & 0 deletions torchvision/csrc/io/decoder/stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,15 @@ Stream::~Stream() {
}
}

// Resolve the FFmpeg decoder implementation for the codec id carried in the
// stream's codec parameters; returns nullptr if FFmpeg has no such decoder.
AVCodec* Stream::findCodec(AVCodecParameters* params) {
  const auto codecId = params->codec_id;
  return avcodec_find_decoder(codecId);
}

// Allocate memory for the AVCodecContext, which will hold the context for
// decode/encode process. Then fill this codec context with CODEC parameters
// defined in stream parameters. Open the codec, and allocate the global frame
// defined in the header file
int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
AVStream* steam = inputCtx_->streams[format_.stream];

Expand Down Expand Up @@ -93,6 +98,9 @@ int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
return ret;
}

// send the raw data packet (compressed frame) to the decoder, through the codec
// context and receive the raw data frame (uncompressed frame) from the
// decoder, through the same codec context
int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
int consumed = 0;
int result = avcodec_send_packet(codecCtx_, packet);
Expand Down Expand Up @@ -134,6 +142,9 @@ int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
return consumed;
}

// General decoding function:
// given the packet, analyse the metadata, and write the
// metadata and the buffer to the DecoderOutputImage.
int Stream::decodePacket(
const AVPacket* packet,
DecoderOutputMessage* out,
Expand Down Expand Up @@ -167,6 +178,9 @@ int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
return 1;
}

// Sets the header and payload via stream::setHeader and copyFrameBytes
// functions that are defined in type stream subclass (VideoStream, AudioStream,
// ...)
int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
if (flush) {
// only flush of audio frames makes sense
Expand Down
34 changes: 33 additions & 1 deletion torchvision/csrc/io/decoder/video_sampler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,25 @@
namespace ffmpeg {

namespace {

// Setup the data pointers and linesizes based on the specified image
// parameters and the provided array. This sets up "planes" to point to a
// "buffer"
// NOTE: this is most likely the culprit behind #3534
//
// Args:
//   fmt: desired output video format
//   buffer: source constant image buffer (in a different format) that will
//     contain the final image after SWScale
//   planes: destination data pointers to be filled
//   lineSize: target destination linesizes (always {0})
int preparePlanes(
const VideoFormat& fmt,
const uint8_t* buffer,
uint8_t** planes,
int* lineSize) {
int result;

// NOTE: the 1 at the end of av_image_fill_arrays is the alignment value
if ((result = av_image_fill_arrays(
planes,
lineSize,
Expand All @@ -28,6 +40,18 @@ int preparePlanes(
return result;
}

// Scale (and crop) the image slice in srcSlice and put the resulting scaled
// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as
// `sws_scale` cannot access buffers directly.
//
// Args:
//   context: SWSContext allocated in VideoSampler::init (either the optional
//     cropping context or the scaling context)
//   srcSlice: frame data in YUV420P
//   srcStride: the array containing the strides for each plane of the source
//     image (from AVFrame->linesize)
//   out: destination buffer
//   planes: indirect destination buffer (mapped to "out" via preparePlanes)
//   lines: destination linesize; constant {0}
int transformImage(
SwsContext* context,
const uint8_t* const srcSlice[],
Expand All @@ -41,7 +65,7 @@ int transformImage(
if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) {
return result;
}

// NOTE: the 0 passed to sws_scale is srcSliceY, the first row of the source
// slice — we always convert the full frame starting at row 0
if ((result = sws_scale(
context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) <
0) {
Expand Down Expand Up @@ -153,6 +177,12 @@ bool VideoSampler::init(const SamplerParameters& params) {
return scaleContext_ != nullptr;
}

// Main body of the sample function called from one of the overloads below
//
// Args:
//   srcSlice: decoded AVFrame->data prepared buffer
// srcStride: linesize (usually obtained from AVFrame->linesize)
// out: return buffer (ByteStorage*)
int VideoSampler::sample(
const uint8_t* const srcSlice[],
int srcStride[],
Expand Down Expand Up @@ -221,6 +251,7 @@ int VideoSampler::sample(
return outImageSize;
}

// Call from `video_stream.cpp::114` - occurs during file reads
int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
if (!frame) {
return 0; // no flush for videos
Expand All @@ -229,6 +260,7 @@ int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
return sample(frame->data, frame->linesize, out);
}

// Call from `video_stream.cpp::114` - not sure when this occurs
int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) {
if (!in) {
return 0; // no flush for videos
Expand Down
5 changes: 4 additions & 1 deletion torchvision/csrc/io/decoder/video_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ int VideoStream::initFormat() {
: -1;
}

// copies frame bytes via sws_scale call in video_sampler.cpp
int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
if (!sampler_) {
sampler_ = std::make_unique<VideoSampler>(SWS_AREA, loggingUuid_);
Expand Down Expand Up @@ -112,7 +113,9 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
<< ", minDimension: " << format_.format.video.minDimension
<< ", crop: " << format_.format.video.cropImage;
}

// calls to a sampler that converts the frame from YUV422 to RGB24, and
// optionally crops and resizes the frame. Frame bytes are copied from
// frame_->data to out buffer
return sampler_->sample(flush ? nullptr : frame_, out);
}

Expand Down