pytorch · fmassa · Oct 6, 2020 · Aug 19, 2020 · Aug 19, 2020 · Aug 19, 2020
diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml
@@ -1,4 +1,5 @@
 channels:
+  - pytorch
   - defaults
 dependencies:
   - numpy
@@ -8,6 +9,7 @@ dependencies:
   - pip
   - libpng
   - jpeg
+  - ffmpeg=4.2
   - ca-certificates
   - pip:
     - future

diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml
@@ -1,4 +1,5 @@
 channels:
+  - pytorch
   - defaults
 dependencies:
   - numpy

diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh
@@ -32,6 +32,8 @@ else
     cp "/usr/lib64/libjpeg.so" torchvision
 fi
 
+download_copy_ffmpeg
+
 if [[ "$OSTYPE" == "msys" ]]; then
     IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel
 else

diff --git a/packaging/conda/build_vision.sh b/packaging/conda/build_vision.sh
@@ -127,7 +127,7 @@ else
 fi
 
 if [[ -z "$PYTORCH_VERSION" ]]; then
-    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
+    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly -c pytorch"
     export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
                                 python -c "import os, sys, json, re; cuver = '$cuver'; \
                                 cuver = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \

diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash
@@ -240,7 +240,7 @@ setup_pip_pytorch_version() {
 # You MUST have populated PYTORCH_VERSION_SUFFIX before hand.
 setup_conda_pytorch_constraint() {
   if [[ -z "$PYTORCH_VERSION" ]]; then
-    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly"
+    export CONDA_CHANNEL_FLAGS="-c pytorch-nightly -c pytorch"
     export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \
                               python -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \
                                cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \
@@ -350,3 +350,39 @@ setup_junit_results_folder() {
     export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml"
   fi
 }
+
+
+download_copy_ffmpeg() {
+  # Download the pytorch-channel FFmpeg 4.2 package into a scratch directory and
+  # copy its shared libraries into ./torchvision (Linux also installs into /usr).
+  mkdir -p ffmpeg_tmp  # -p: a directory left by an interrupted run is not an error
+  cd ffmpeg_tmp || return 1  # never download/unpack outside the scratch directory
+  if [[ "$OSTYPE" == "msys" ]]; then
+    # conda install -yq ffmpeg -c pytorch
+    # curl -L -q https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2
+    # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=-
+    # cp Library/bin/*.dll ../torchvision
+    echo "FFmpeg is disabled currently on Windows"
+  elif [[ "$(uname)" == Darwin ]]; then
+    conda install -yq ffmpeg=4.2 -c pytorch
+    conda install -yq wget
+    wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/osx-64/ffmpeg-4.2-h0a44026_0.tar.bz2
+    tar -xjvf ffmpeg-4.2-h0a44026_0.tar.bz2
+    for f in lib/*.dylib; do
+      if [[ $f =~ ([a-z])+\.dylib ]]; then  # letter right before .dylib: skips libfoo.58.dylib
+        cp "$f" ../torchvision
+      fi
+    done
+  else
+    wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2
+    tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2
+    cp lib/*.so ../torchvision
+    cp -r lib/* /usr/lib
+    cp -r bin/* /usr/bin
+    cp -r include/* /usr/include
+    ldconfig
+    which ffmpeg
+  fi
+  cd ..
+  rm -rf ffmpeg_tmp
+}
diff --git a/packaging/torchvision/conda_build_config.yaml b/packaging/torchvision/conda_build_config.yaml
@@ -1,3 +1,5 @@
+channel_sources:
+  - pytorch-nightly,pytorch,defaults
 blas_impl:
   - mkl                        # [x86_64]
 c_compiler:

diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml
@@ -10,6 +10,7 @@ requirements:
     - {{ compiler('c') }} # [win]
     - libpng
     - jpeg
+    - ffmpeg =4.2  # [not win]
 
   host:
     - python
@@ -21,6 +22,7 @@ requirements:
   run:
     - python
     - libpng
+    - ffmpeg =4.2  # [not win]
     - jpeg
     - pillow >=4.1.1
     - numpy >=1.11
@@ -48,7 +50,7 @@ test:
   requires:
     - pytest
     - scipy
-    - av
+    - av =8.0.1
     - ca-certificates
     {{ environ.get('CONDA_TYPING_CONSTRAINT') }}
 

diff --git a/setup.py b/setup.py
@@ -337,7 +337,9 @@ def get_extensions():
         ffmpeg_bin = os.path.dirname(ffmpeg_exe)
         ffmpeg_root = os.path.dirname(ffmpeg_bin)
         ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')
+        ffmpeg_library_dir = os.path.join(ffmpeg_root, 'lib')
         print("ffmpeg include path: {}".format(ffmpeg_include_dir))
+        print("ffmpeg library_dir: {}".format(ffmpeg_library_dir))
 
         # TorchVision base decoder + video reader
         video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
@@ -360,16 +362,16 @@ def get_extensions():
                     ffmpeg_include_dir,
                     extensions_dir,
                 ],
-                library_dirs=library_dirs,
+                library_dirs=[ffmpeg_library_dir] + library_dirs,
                 libraries=[
                     'avcodec',
                     'avformat',
                     'avutil',
                     'swresample',
                     'swscale',
                 ],
-                extra_compile_args=["-std=c++14"],
-                extra_link_args=["-std=c++14"],
+                extra_compile_args=["-std=c++14"] if os.name != 'nt' else ['/std:c++14', '/MP'],
+                extra_link_args=["-std=c++14" if os.name != 'nt' else '/std:c++14'],
             )
         )
 

diff --git a/test/test_datasets_video_utils_opt.py b/test/test_datasets_video_utils_opt.py
@@ -2,8 +2,8 @@
 from torchvision import set_video_backend
 import test_datasets_video_utils
 
-
-set_video_backend('video_reader')
+# Disabling the video backend switching temporarily
+# set_video_backend('video_reader')
 
 
 if __name__ == '__main__':

diff --git a/test/test_io_opt.py b/test/test_io_opt.py
@@ -3,7 +3,8 @@
 import test_io
 
 
-set_video_backend('video_reader')
+# Disabling the video backend switching temporarily
+# set_video_backend('video_reader')
 
 
 if __name__ == '__main__':

diff --git a/torchvision/csrc/cpu/decoder/decoder.h b/torchvision/csrc/cpu/decoder/decoder.h
@@ -5,6 +5,11 @@
 #include "seekable_buffer.h"
 #include "stream.h"
 
+#if defined(_MSC_VER)
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#endif
+
 namespace ffmpeg {
 
 /**

diff --git a/torchvision/csrc/cpu/decoder/stream.cpp b/torchvision/csrc/cpu/decoder/stream.cpp
@@ -3,6 +3,7 @@
 #include "util.h"
 
 namespace ffmpeg {
+const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
 
 Stream::Stream(
     AVFormatContext* inputCtx,
@@ -85,7 +86,7 @@ int Stream::openCodec(std::vector<DecoderMetadata>* metadata) {
     header.num = steam->time_base.num;
     header.den = steam->time_base.den;
     header.duration =
-        av_rescale_q(steam->duration, steam->time_base, AV_TIME_BASE_Q);
+        av_rescale_q(steam->duration, steam->time_base, timeBaseQ);
     metadata->push_back(header);
   }
 
@@ -238,7 +239,7 @@ void Stream::setFramePts(DecoderHeader* header, bool flush) {
       header->pts = av_rescale_q(
           header->pts,
           inputCtx_->streams[format_.stream]->time_base,
-          AV_TIME_BASE_Q);
+          timeBaseQ);
     }
 
     switch (format_.type) {

diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp
@@ -4,6 +4,7 @@
 #include "util.h"
 
 namespace ffmpeg {
+const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
 
 SubtitleStream::SubtitleStream(
     AVFormatContext* inputCtx,
@@ -65,7 +66,7 @@ int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) {
   // set proper pts in us
   if (gotFramePtr) {
     sub_.pts = av_rescale_q(
-        pkt.pts, inputCtx_->streams[format_.stream]->time_base, AV_TIME_BASE_Q);
+        pkt.pts, inputCtx_->streams[format_.stream]->time_base, timeBaseQ);
   }
 
   return result;

diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.cpp b/torchvision/csrc/cpu/video_reader/VideoReader.cpp
@@ -29,6 +29,7 @@ namespace video_reader {
 
 const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24;
 const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT;
+const AVRational timeBaseQ = AVRational{1, AV_TIME_BASE};
 const size_t decoderTimeoutMs = 600000;
 // A jitter can be added to the end of the range to avoid conversion/rounding
 // error, small value 100us won't be enough to select the next frame, but enough
@@ -99,8 +100,8 @@ size_t fillTensor(
   for (size_t i = 0; i < msgs.size(); ++i) {
     const auto& msg = msgs[i];
     // convert pts into original time_base
-    AVRational avr = {(int)num, (int)den};
-    framePtsData[i] = av_rescale_q(msg.header.pts, AV_TIME_BASE_Q, avr);
+    AVRational avr = AVRational{(int)num, (int)den};
+    framePtsData[i] = av_rescale_q(msg.header.pts, timeBaseQ, avr);
     VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts
             << ", original: " << framePtsData[i];
 
@@ -156,28 +157,26 @@ void offsetsToUs(
   videoEndUs = -1;
 
   if (readVideoStream) {
-    AVRational vr = {(int)videoTimeBaseNum, (int)videoTimeBaseDen};
+    AVRational vr = AVRational{(int)videoTimeBaseNum, (int)videoTimeBaseDen};
     if (videoStartPts > 0) {
-      videoStartUs = av_rescale_q(videoStartPts, vr, AV_TIME_BASE_Q);
+      videoStartUs = av_rescale_q(videoStartPts, vr, timeBaseQ);
     }
     if (videoEndPts > 0) {
       // Add jitter to the end of the range to avoid conversion/rounding error.
       // Small value 100us won't be enough to select the next frame, but enough
       // to compensate rounding error due to the multiple conversions.
-      videoEndUs =
-          timeBaseJitterUs + av_rescale_q(videoEndPts, vr, AV_TIME_BASE_Q);
+      videoEndUs = timeBaseJitterUs + av_rescale_q(videoEndPts, vr, timeBaseQ);
     }
   } else if (readAudioStream) {
-    AVRational ar = {(int)audioTimeBaseNum, (int)audioTimeBaseDen};
+    AVRational ar = AVRational{(int)audioTimeBaseNum, (int)audioTimeBaseDen};
     if (audioStartPts > 0) {
-      videoStartUs = av_rescale_q(audioStartPts, ar, AV_TIME_BASE_Q);
+      videoStartUs = av_rescale_q(audioStartPts, ar, timeBaseQ);
     }
     if (audioEndPts > 0) {
       // Add jitter to the end of the range to avoid conversion/rounding error.
       // Small value 100us won't be enough to select the next frame, but enough
       // to compensate rounding error due to the multiple conversions.
-      videoEndUs =
-          timeBaseJitterUs + av_rescale_q(audioEndPts, ar, AV_TIME_BASE_Q);
+      videoEndUs = timeBaseJitterUs + av_rescale_q(audioEndPts, ar, timeBaseQ);
     }
   }
 }
@@ -336,8 +335,8 @@ torch::List<torch::Tensor> readVideo(
 
       videoDuration = torch::zeros({1}, torch::kLong);
       int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-      AVRational vr = {(int)header.num, (int)header.den};
-      videoDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, vr);
+      AVRational vr = AVRational{(int)header.num, (int)header.den};
+      videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, vr);
       VLOG(1) << "Video decoding from " << logType << " [" << logMessage
               << "] filled video tensors";
     } else {
@@ -398,8 +397,8 @@ torch::List<torch::Tensor> readVideo(
 
       audioDuration = torch::zeros({1}, torch::kLong);
       int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-      AVRational ar = {(int)header.num, (int)header.den};
-      audioDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, ar);
+      AVRational ar = AVRational{(int)header.num, (int)header.den};
+      audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, ar);
       VLOG(1) << "Video decoding from " << logType << " [" << logMessage
               << "] filled audio tensors";
     } else {
@@ -598,8 +597,8 @@ torch::List<torch::Tensor> probeVideo(
 
     videoDuration = torch::zeros({1}, torch::kLong);
     int64_t* videoDurationData = videoDuration.data_ptr<int64_t>();
-    AVRational avr = {(int)header.num, (int)header.den};
-    videoDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, avr);
+    AVRational avr = AVRational{(int)header.num, (int)header.den};
+    videoDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
 
     VLOG(2) << "Prob fps: " << header.fps << ", duration: " << header.duration
             << ", num: " << header.num << ", den: " << header.den;
@@ -631,8 +630,8 @@ torch::List<torch::Tensor> probeVideo(
 
     audioDuration = torch::zeros({1}, torch::kLong);
     int64_t* audioDurationData = audioDuration.data_ptr<int64_t>();
-    AVRational avr = {(int)header.num, (int)header.den};
-    audioDurationData[0] = av_rescale_q(header.duration, AV_TIME_BASE_Q, avr);
+    AVRational avr = AVRational{(int)header.num, (int)header.den};
+    audioDurationData[0] = av_rescale_q(header.duration, timeBaseQ, avr);
 
     VLOG(2) << "Prob sample rate: " << format.samples
             << ", duration: " << header.duration << ", num: " << header.num

diff --git a/torchvision/io/_video_opt.py b/torchvision/io/_video_opt.py
@@ -88,7 +88,7 @@ def _validate_pts(pts_range):
         assert (
             pts_range[0] <= pts_range[1]
         ), """Start pts should not be smaller than end pts, got
-            start pts: %d and end pts: %d""" % (
+            start pts: {0:d} and end pts: {1:d}""".format(
             pts_range[0],
             pts_range[1],
         )