Skip to content

Commit 8e2bd0e

Browse files
bjuncekfmassadatumbox
authored
[docs] descriptive comments of the decoder C++ api (#3754)
* document video_sampler. * minor docs for decoder.cpp * descriptive comments for the stream.c * descriptive comments for decoder.cpp * per-stream descriptive comments * Fixing CLANG hopefully * addressing prabhat's comments * typo I think Co-authored-by: Francisco Massa <[email protected]> Co-authored-by: Vasilis Vryniotis <[email protected]>
1 parent 38175ed commit 8e2bd0e

File tree

5 files changed

+71
-3
lines changed

5 files changed

+71
-3
lines changed

torchvision/csrc/io/decoder/audio_stream.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ int AudioStream::initFormat() {
6868
: -1;
6969
}
7070

71+
// copies audio sample bytes via swr_convert call in audio_sampler.cpp
7172
int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
7273
if (!sampler_) {
7374
sampler_ = std::make_unique<AudioSampler>(codecCtx_);
@@ -95,6 +96,8 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
9596
<< ", channels: " << format_.format.audio.channels
9697
<< ", format: " << format_.format.audio.format;
9798
}
99+
// calls to a sampler that converts the audio samples and copies them to the
100+
// out buffer via ffmpeg::swr_convert
98101
return sampler_->sample(flush ? nullptr : frame_, out);
99102
}
100103

torchvision/csrc/io/decoder/decoder.cpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,12 @@ Decoder::~Decoder() {
218218
cleanUp();
219219
}
220220

221+
// Initialise the format context that holds information about the container and
222+
// fill it with minimal information about the format (codecs are not opened
223+
// here). Function reads in information about the streams from the container
224+
// into inputCtx and then passes it to decoder::openStreams. Finally, if seek is
225+
// specified within the decoder parameters, it seeks into the correct frame
226+
// (note, the seek defined here is "precise" seek).
221227
bool Decoder::init(
222228
const DecoderParameters& params,
223229
DecoderInCallback&& in,
@@ -384,7 +390,7 @@ bool Decoder::init(
384390
cleanUp();
385391
return false;
386392
}
387-
393+
// SyncDecoder, which inherits from Decoder, would override onInit.
388394
onInit();
389395

390396
if (params.startOffset != 0) {
@@ -399,6 +405,8 @@ bool Decoder::init(
399405
return true;
400406
}
401407

408+
// open appropriate CODEC for every type of stream and move it to the class
409+
// variable `streams_` and make sure it is in range for decoding
402410
bool Decoder::openStreams(std::vector<DecoderMetadata>* metadata) {
403411
for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) {
404412
// - find the corresponding format in the params_.formats set
@@ -485,6 +493,10 @@ void Decoder::cleanUp() {
485493
seekableBuffer_.shutdown();
486494
}
487495

496+
// function does actual work, derived class calls it in working thread
497+
// periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if
498+
// no frames got decoded in the specified timeout time, and error on
499+
// unrecoverable error.
488500
int Decoder::getFrame(size_t workingTimeInMs) {
489501
if (inRange_.none()) {
490502
return ENODATA;
@@ -601,11 +613,13 @@ int Decoder::getFrame(size_t workingTimeInMs) {
601613
return 0;
602614
}
603615

616+
// find stream by stream index
604617
Stream* Decoder::findByIndex(int streamIndex) const {
605618
auto it = streams_.find(streamIndex);
606619
return it != streams_.end() ? it->second.get() : nullptr;
607620
}
608621

622+
// find stream by type; note finds only the first stream of a given type
609623
Stream* Decoder::findByType(const MediaFormat& format) const {
610624
for (auto& stream : streams_) {
611625
if (stream.second->getMediaFormat().type == format.type) {
@@ -615,6 +629,8 @@ Stream* Decoder::findByType(const MediaFormat& format) const {
615629
return nullptr;
616630
}
617631

632+
// given the stream and packet, decode the frame buffers into the
633+
// DecoderOutputMessage data structure via stream::decodePacket function.
618634
int Decoder::processPacket(
619635
Stream* stream,
620636
AVPacket* packet,

torchvision/csrc/io/decoder/stream.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,15 @@ Stream::~Stream() {
2424
}
2525
}
2626

27+
// look up the decoder matching the stream's codec_id via avcodec_find_decoder
2728
AVCodec* Stream::findCodec(AVCodecParameters* params) {
2829
return avcodec_find_decoder(params->codec_id);
2930
}
3031

32+
// Allocate memory for the AVCodecContext, which will hold the context for
33+
// decode/encode process. Then fill this codec context with CODEC parameters
34+
// defined in stream parameters. Open the codec, and allocate the global frame
35+
// defined in the header file
3136
int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
3237
AVStream* steam = inputCtx_->streams[format_.stream];
3338

@@ -93,6 +98,9 @@ int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
9398
return ret;
9499
}
95100

101+
// send the raw data packet (compressed frame) to the decoder, through the codec
102+
// context and receive the raw data frame (uncompressed frame) from the
103+
// decoder, through the same codec context
96104
int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
97105
int consumed = 0;
98106
int result = avcodec_send_packet(codecCtx_, packet);
@@ -134,6 +142,9 @@ int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
134142
return consumed;
135143
}
136144

145+
// General decoding function:
146+
// given the packet, analyse the metadata, and write the
147+
// metadata and the buffer to the DecoderOutputMessage.
137148
int Stream::decodePacket(
138149
const AVPacket* packet,
139150
DecoderOutputMessage* out,
@@ -167,6 +178,9 @@ int Stream::flush(DecoderOutputMessage* out, bool headerOnly) {
167178
return 1;
168179
}
169180

181+
// Sets the header and payload via stream::setHeader and copyFrameBytes
182+
// functions that are defined in type stream subclass (VideoStream, AudioStream,
183+
// ...)
170184
int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) {
171185
if (flush) {
172186
// only flush of audio frames makes sense

torchvision/csrc/io/decoder/video_sampler.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,25 @@
77
namespace ffmpeg {
88

99
namespace {
10+
11+
// Setup the data pointers and linesizes based on the specified image
12+
// parameters and the provided array. This sets up "planes" to point to a
13+
// "buffer"
14+
// NOTE: this is most likely culprit behind #3534
15+
//
16+
// Args:
17+
// fmt: desired output video format
18+
// buffer: source constant image buffer (in different format) that will contain
19+
// the final image after SWScale
// planes: destination data pointer to be filled
20+
// lineSize: target destination linesize (always {0})
1021
int preparePlanes(
1122
const VideoFormat& fmt,
1223
const uint8_t* buffer,
1324
uint8_t** planes,
1425
int* lineSize) {
1526
int result;
1627

28+
// NOTE: 1 at the end of av_image_fill_arrays is the value used for alignment
1729
if ((result = av_image_fill_arrays(
1830
planes,
1931
lineSize,
@@ -28,6 +40,18 @@ int preparePlanes(
2840
return result;
2941
}
3042

43+
// Scale (and crop) the image slice in srcSlice and put the resulting scaled
44+
// slice to `planes` buffer, which is mapped to be `out` via preparePlanes as
45+
// `sws_scale` cannot access buffers directly.
46+
//
47+
// Args:
48+
// context: SWSContext allocated on line 119 (if crop, optional) or 163 (if
49+
// scale)
// srcSlice: frame data in YUV420P
// srcStride: the array containing the
50+
// strides for each plane of the source
51+
// image (from AVFrame->linesize[0])
52+
// out: destination buffer
53+
// planes: indirect destination buffer (mapped to "out" via preparePlanes)
54+
// lines: destination linesize; constant {0}
3155
int transformImage(
3256
SwsContext* context,
3357
const uint8_t* const srcSlice[],
@@ -41,7 +65,7 @@ int transformImage(
4165
if ((result = preparePlanes(outFormat, out, planes, lines)) < 0) {
4266
return result;
4367
}
44-
68+
// NOTE: srcSliceY (the 4th argument) is always 0: the slice handed to
// sws_scale starts at the first row of the source image and spans its
// full height
4569
if ((result = sws_scale(
4670
context, srcSlice, srcStride, 0, inFormat.height, planes, lines)) <
4771
0) {
@@ -153,6 +177,12 @@ bool VideoSampler::init(const SamplerParameters& params) {
153177
return scaleContext_ != nullptr;
154178
}
155179

180+
// Main body of the sample function called from one of the overloads below
181+
//
182+
// Args:
183+
// srcSlice: decoded AVFrame->data prepared buffer
184+
// srcStride: linesize (usually obtained from AVFrame->linesize)
185+
// out: return buffer (ByteStorage*)
156186
int VideoSampler::sample(
157187
const uint8_t* const srcSlice[],
158188
int srcStride[],
@@ -221,6 +251,7 @@ int VideoSampler::sample(
221251
return outImageSize;
222252
}
223253

254+
// Call from `video_stream.cpp::114` - occurs during file reads
224255
int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
225256
if (!frame) {
226257
return 0; // no flush for videos
@@ -229,6 +260,7 @@ int VideoSampler::sample(AVFrame* frame, ByteStorage* out) {
229260
return sample(frame->data, frame->linesize, out);
230261
}
231262

263+
// Call from `video_stream.cpp::114` - not sure when this occurs
232264
int VideoSampler::sample(const ByteStorage* in, ByteStorage* out) {
233265
if (!in) {
234266
return 0; // no flush for videos

torchvision/csrc/io/decoder/video_stream.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ int VideoStream::initFormat() {
8282
: -1;
8383
}
8484

85+
// copies frame bytes via sws_scale call in video_sampler.cpp
8586
int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
8687
if (!sampler_) {
8788
sampler_ = std::make_unique<VideoSampler>(SWS_AREA, loggingUuid_);
@@ -112,7 +113,9 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) {
112113
<< ", minDimension: " << format_.format.video.minDimension
113114
<< ", crop: " << format_.format.video.cropImage;
114115
}
115-
116+
// calls to a sampler that converts the frame from YUV422 to RGB24, and
117+
// optionally crops and resizes the frame. Frame bytes are copied from
118+
// frame_->data to out buffer
116119
return sampler_->sample(flush ? nullptr : frame_, out);
117120
}
118121

0 commit comments

Comments
 (0)