diff --git a/.circleci/.gitignore b/.circleci/.gitignore deleted file mode 100644 index 485dee64bcf..00000000000 --- a/.circleci/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea diff --git a/.circleci/build_docs/commit_docs.sh b/.circleci/build_docs/commit_docs.sh deleted file mode 100755 index 04e3538fefc..00000000000 --- a/.circleci/build_docs/commit_docs.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -set -ex - - -if [ "$2" == "" ]; then - echo call as "$0" "" "" - echo where src is the root of the built documentation git checkout and - echo branch should be "main" or "1.7" or so - exit 1 -fi - -src=$1 -target=$2 - -echo "committing docs from ${src} to ${target}" - -pushd "${src}" -git checkout gh-pages -mkdir -p ./"${target}" -rm -rf ./"${target}"/* -cp -r "${src}/docs/build/html/"* ./"$target" -if [ "${target}" == "main" ]; then - mkdir -p ./_static - rm -rf ./_static/* - cp -r "${src}/docs/build/html/_static/"* ./_static - git add --all ./_static || true -fi -git add --all ./"${target}" || true -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "auto-generating sphinx docs" || true -git remote add https https://github.com/pytorch/vision.git -git push -u https gh-pages diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 4262656ae91..00000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,3300 +0,0 @@ -version: 2.1 - -# How to test the Linux jobs: -# - Install CircleCI local CLI: https://circleci.com/docs/2.0/local-cli/ -# - circleci config process .circleci/config.yml > gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=nightly - # On tags upload to test instead - if [[ -n "${CIRCLE_TAG}" ]]; then - our_upload_channel=test - fi - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. - for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. - git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. 
- # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - install_prototype_dependencies: - steps: - - pip_install: - args: iopath - descr: Install third-party dependencies - - pip_install: - args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install torchdata from nightly releases - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "pytorch/manylinux-cuda102" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - install_prototype_dependencies - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 20m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - # cudatoolkit >= 11 isn't available for windows in the nvidia channel - if [[ "${CU_VERSION}" =~ cu11.* ]]; then - export CONDA_CHANNEL_FLAGS="-c conda-forge" - fi - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: continuumio/miniconda - steps: - - 
attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda102" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.3" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - 
checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \< gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=nightly - # On tags upload to test instead - if [[ -n "${CIRCLE_TAG}" ]]; then - our_upload_channel=test - fi - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. 
- for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. - git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - install_prototype_dependencies: - steps: - - pip_install: - args: iopath - descr: Install third-party dependencies - - pip_install: - args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu - descr: Install torchdata from nightly releases - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "pytorch/manylinux-cuda102" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - install_prototype_dependencies - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 20m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - # cudatoolkit >= 11 isn't available for windows in the nvidia channel - if [[ "${CU_VERSION}" =~ cu11.* ]]; then - export CONDA_CHANNEL_FLAGS="-c conda-forge" - fi - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: continuumio/miniconda - steps: - - 
attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/nightly/torch_nightly.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda102" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.3" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda102" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> 
packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \<> ~/.bashrc -CMD [ "/bin/bash"] diff --git a/.circleci/unittest/android/scripts/binary_android_build.sh b/.circleci/unittest/android/scripts/binary_android_build.sh deleted file mode 100644 index 0d8c0d47d8a..00000000000 --- a/.circleci/unittest/android/scripts/binary_android_build.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "GRADLE_PATH $GRADLE_PATH" -echo "GRADLE_HOME $GRADLE_HOME" - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} assemble || true - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz -find . 
-type f -name *apk -print | xargs tar cfvz ~/workspace/artifacts/artifacts-apks.tgz diff --git a/.circleci/unittest/android/scripts/binary_android_upload.sh b/.circleci/unittest/android/scripts/binary_android_upload.sh deleted file mode 100644 index 1472a877d90..00000000000 --- a/.circleci/unittest/android/scripts/binary_android_upload.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES -GRADLE_PROPERTIES=/home/circleci/project/android/gradle.properties - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "SONATYPE_NEXUS_USERNAME=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryUsername=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "SONATYPE_NEXUS_PASSWORD=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryPassword=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES - -echo "signing.keyId=${ANDROID_SIGN_KEY}" >> $GRADLE_PROPERTIES -echo "signing.password=${ANDROID_SIGN_PASS}" >> $GRADLE_PROPERTIES - -cat /home/circleci/project/android/gradle.properties | grep VERSION - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} ops:uploadArchives - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz diff --git a/.circleci/unittest/android/scripts/install_gradle.sh b/.circleci/unittest/android/scripts/install_gradle.sh deleted file mode 100755 index 5f803abfa94..00000000000 --- a/.circleci/unittest/android/scripts/install_gradle.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -ex - -_https_amazon_aws=https://ossci-android.s3.amazonaws.com -GRADLE_VERSION=6.8.3 - -_gradle_home=/opt/gradle -sudo rm -rf $gradle_home -sudo mkdir -p $_gradle_home - -curl --silent --output /tmp/gradle.zip --retry 3 $_https_amazon_aws/gradle-${GRADLE_VERSION}-bin.zip - -sudo unzip -q /tmp/gradle.zip -d $_gradle_home -rm /tmp/gradle.zip - -sudo chmod -R 777 $_gradle_home - -export GRADLE_HOME=$_gradle_home/gradle-$GRADLE_VERSION -export GRADLE_PATH=${GRADLE_HOME}/bin/gradle diff --git a/.circleci/unittest/ios/scripts/binary_ios_build.sh b/.circleci/unittest/ios/scripts/binary_ios_build.sh deleted file mode 100755 index e2ad7b0c55f..00000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_build.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT_IOS=/Users/distiller/project/ios -PYTORCH_IOS_NIGHTLY_NAME=libtorch_ios_nightly_build.zip -export TCLLIBPATH="/usr/local/lib" - -# install conda -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate - -# install dependencies -conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi requests typing_extensions wget --yes -conda install -c conda-forge valgrind --yes -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - -# sync 
submodules -cd ${PROJ_ROOT_IOS} -git submodule sync -git submodule update --init --recursive - -# download pytorch-iOS nightly build and unzip it -mkdir -p ${PROJ_ROOT_IOS}/lib -mkdir -p ${PROJ_ROOT_IOS}/build -mkdir -p ${PROJ_ROOT_IOS}/pytorch -TORCH_ROOT="${PROJ_ROOT_IOS}/pytorch" - -cd ${TORCH_ROOT} -wget https://ossci-ios-build.s3.amazonaws.com/${PYTORCH_IOS_NIGHTLY_NAME} -mkdir -p ./build_ios -unzip -d ./build_ios ./${PYTORCH_IOS_NIGHTLY_NAME} - -LIBTORCH_HEADER_ROOT="${TORCH_ROOT}/build_ios/install/include" -cd ${PROJ_ROOT_IOS} -IOS_ARCH=${IOS_ARCH} LIBTORCH_HEADER_ROOT=${LIBTORCH_HEADER_ROOT} ./build_ios.sh -rm -rf ${TORCH_ROOT} - -# store the binary -DEST_DIR=${WORKSPACE}/ios/${IOS_ARCH} -mkdir -p ${DEST_DIR} -cp ${PROJ_ROOT_IOS}/lib/*.a ${DEST_DIR} diff --git a/.circleci/unittest/ios/scripts/binary_ios_upload.sh b/.circleci/unittest/ios/scripts/binary_ios_upload.sh deleted file mode 100644 index ce56388e5da..00000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_upload.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" - -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT=/Users/distiller/project -ARTIFACTS_DIR=${WORKSPACE}/ios -ls ${ARTIFACTS_DIR} -ZIP_DIR=${WORKSPACE}/zip -mkdir -p ${ZIP_DIR}/install/lib - -# build a FAT bianry -cd ${ZIP_DIR}/install/lib -libs=("${ARTIFACTS_DIR}/x86_64/libtorchvision_ops.a" "${ARTIFACTS_DIR}/arm64/libtorchvision_ops.a") -lipo -create "${libs[@]}" -o ${ZIP_DIR}/install/lib/libtorchvision_ops.a -lipo -i ${ZIP_DIR}/install/lib/*.a - -# copy the license -cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/ -# zip the library -ZIPFILE=libtorchvision_ops_ios_nightly_build.zip -cd ${ZIP_DIR} -#for testing -touch version.txt -echo $(date +%s) > version.txt -zip -r ${ZIPFILE} install version.txt LICENSE - -# upload to aws -# Install conda then 'conda install' awscli -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate -conda install -c conda-forge awscli --yes -set +x -export AWS_ACCESS_KEY_ID=${AWS_S3_ACCESS_KEY_FOR_PYTORCH_BINARY_UPLOAD} -export AWS_SECRET_ACCESS_KEY=${AWS_S3_ACCESS_SECRET_FOR_PYTORCH_BINARY_UPLOAD} -set -x -aws s3 cp ${ZIPFILE} s3://ossci-ios-build/ --acl public-read diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml deleted file mode 100644 index 77ee9929519..00000000000 --- a/.circleci/unittest/linux/scripts/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - h5py - - pip: - - future - - scipy - - av diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh deleted file mode 100755 index c9c85bdd88a..00000000000 --- a/.circleci/unittest/linux/scripts/install.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. -# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. 
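The install script that follows maps the compact `CU_VERSION` string (for example `cu113`) to a dotted `CUDA_VERSION` by slicing on length: four characters means a single-digit major version, five characters a two-digit one. A small sketch of that mapping with a few representative values (illustrative only, not the full list of supported toolkits):

```bash
# Same substring logic as in .circleci/unittest/*/scripts/install.sh.
for CU_VERSION in cu92 cu102 cu113 cu117; do
  if [[ ${#CU_VERSION} -eq 4 ]]; then
    CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}"
  elif [[ ${#CU_VERSION} -eq 5 ]]; then
    CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
  fi
  echo "${CU_VERSION} -> ${CUDA_VERSION}"   # cu92 -> 9.2, cu102 -> 10.2, cu113 -> 11.3, cu117 -> 11.7
done
```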
- -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - - cudatoolkit="nvidia::cudatoolkit=${version}" - if [[ "$version" == "11.6" || "$version" == "11.7" ]]; then - cudatoolkit=" pytorch-cuda=${version}" - fi -fi - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -if [ "${os}" == "MacOSX" ]; then - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" "pytorch-${UPLOAD_CHANNEL}"::pytorch "${cudatoolkit}" -else - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" -fi - -printf "* Installing torchvision\n" -python setup.py develop diff --git a/.circleci/unittest/linux/scripts/post_process.sh b/.circleci/unittest/linux/scripts/post_process.sh deleted file mode 100755 index e97bf2a7b1b..00000000000 --- a/.circleci/unittest/linux/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env diff --git a/.circleci/unittest/linux/scripts/run-clang-format.py b/.circleci/unittest/linux/scripts/run-clang-format.py deleted file mode 100755 index 5c61b2519e0..00000000000 --- a/.circleci/unittest/linux/scripts/run-clang-format.py +++ /dev/null @@ -1,331 +0,0 @@ -#!/usr/bin/env python -""" -MIT License - -Copyright (c) 2017 Guillaume Papin - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -A wrapper script around clang-format, suitable for linting multiple files -and to use for continuous integration. - -This is an alternative API for the clang-format command line. -It runs over multiple files and directories in parallel. -A diff output is produced and a sensible exit code is returned. 
- -""" - -import argparse -import difflib -import fnmatch -import multiprocessing -import os -import signal -import subprocess -import sys -import traceback -from functools import partial - -try: - from subprocess import DEVNULL # py3k -except ImportError: - DEVNULL = open(os.devnull, "wb") - - -DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu" - - -class ExitStatus: - SUCCESS = 0 - DIFF = 1 - TROUBLE = 2 - - -def list_files(files, recursive=False, extensions=None, exclude=None): - if extensions is None: - extensions = [] - if exclude is None: - exclude = [] - - out = [] - for file in files: - if recursive and os.path.isdir(file): - for dirpath, dnames, fnames in os.walk(file): - fpaths = [os.path.join(dirpath, fname) for fname in fnames] - for pattern in exclude: - # os.walk() supports trimming down the dnames list - # by modifying it in-place, - # to avoid unnecessary directory listings. - dnames[:] = [x for x in dnames if not fnmatch.fnmatch(os.path.join(dirpath, x), pattern)] - fpaths = [x for x in fpaths if not fnmatch.fnmatch(x, pattern)] - for f in fpaths: - ext = os.path.splitext(f)[1][1:] - if ext in extensions: - out.append(f) - else: - out.append(file) - return out - - -def make_diff(file, original, reformatted): - return list( - difflib.unified_diff( - original, reformatted, fromfile=f"{file}\t(original)", tofile=f"{file}\t(reformatted)", n=3 - ) - ) - - -class DiffError(Exception): - def __init__(self, message, errs=None): - super().__init__(message) - self.errs = errs or [] - - -class UnexpectedError(Exception): - def __init__(self, message, exc=None): - super().__init__(message) - self.formatted_traceback = traceback.format_exc() - self.exc = exc - - -def run_clang_format_diff_wrapper(args, file): - try: - ret = run_clang_format_diff(args, file) - return ret - except DiffError: - raise - except Exception as e: - raise UnexpectedError(f"{file}: {e.__class__.__name__}: {e}", e) - - -def run_clang_format_diff(args, file): - try: - with open(file, encoding="utf-8") as f: - original = f.readlines() - except OSError as exc: - raise DiffError(str(exc)) - invocation = [args.clang_format_executable, file] - - # Use of utf-8 to decode the process output. - # - # Hopefully, this is the correct thing to do. - # - # It's done due to the following assumptions (which may be incorrect): - # - clang-format will returns the bytes read from the files as-is, - # without conversion, and it is already assumed that the files use utf-8. - # - if the diagnostics were internationalized, they would use utf-8: - # > Adding Translations to Clang - # > - # > Not possible yet! - # > Diagnostic strings should be written in UTF-8, - # > the client can translate to the relevant code page if needed. - # > Each translation completely replaces the format string - # > for the diagnostic. 
- # > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation - - try: - proc = subprocess.Popen( - invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, encoding="utf-8" - ) - except OSError as exc: - raise DiffError(f"Command '{subprocess.list2cmdline(invocation)}' failed to start: {exc}") - proc_stdout = proc.stdout - proc_stderr = proc.stderr - - # hopefully the stderr pipe won't get full and block the process - outs = list(proc_stdout.readlines()) - errs = list(proc_stderr.readlines()) - proc.wait() - if proc.returncode: - raise DiffError( - "Command '{}' returned non-zero exit status {}".format( - subprocess.list2cmdline(invocation), proc.returncode - ), - errs, - ) - return make_diff(file, original, outs), errs - - -def bold_red(s): - return "\x1b[1m\x1b[31m" + s + "\x1b[0m" - - -def colorize(diff_lines): - def bold(s): - return "\x1b[1m" + s + "\x1b[0m" - - def cyan(s): - return "\x1b[36m" + s + "\x1b[0m" - - def green(s): - return "\x1b[32m" + s + "\x1b[0m" - - def red(s): - return "\x1b[31m" + s + "\x1b[0m" - - for line in diff_lines: - if line[:4] in ["--- ", "+++ "]: - yield bold(line) - elif line.startswith("@@ "): - yield cyan(line) - elif line.startswith("+"): - yield green(line) - elif line.startswith("-"): - yield red(line) - else: - yield line - - -def print_diff(diff_lines, use_color): - if use_color: - diff_lines = colorize(diff_lines) - sys.stdout.writelines(diff_lines) - - -def print_trouble(prog, message, use_colors): - error_text = "error:" - if use_colors: - error_text = bold_red(error_text) - print(f"{prog}: {error_text} {message}", file=sys.stderr) - - -def main(): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--clang-format-executable", - metavar="EXECUTABLE", - help="path to the clang-format executable", - default="clang-format", - ) - parser.add_argument( - "--extensions", - help=f"comma separated list of file extensions (default: {DEFAULT_EXTENSIONS})", - default=DEFAULT_EXTENSIONS, - ) - parser.add_argument("-r", "--recursive", action="store_true", help="run recursively over directories") - parser.add_argument("files", metavar="file", nargs="+") - parser.add_argument("-q", "--quiet", action="store_true") - parser.add_argument( - "-j", - metavar="N", - type=int, - default=0, - help="run N clang-format jobs in parallel (default number of cpus + 1)", - ) - parser.add_argument( - "--color", default="auto", choices=["auto", "always", "never"], help="show colored diff (default: auto)" - ) - parser.add_argument( - "-e", - "--exclude", - metavar="PATTERN", - action="append", - default=[], - help="exclude paths matching the given glob-like pattern(s) from recursive search", - ) - - args = parser.parse_args() - - # use default signal handling, like diff return SIGINT value on ^C - # https://bugs.python.org/issue14229#msg156446 - signal.signal(signal.SIGINT, signal.SIG_DFL) - try: - signal.SIGPIPE - except AttributeError: - # compatibility, SIGPIPE does not exist on Windows - pass - else: - signal.signal(signal.SIGPIPE, signal.SIG_DFL) - - colored_stdout = False - colored_stderr = False - if args.color == "always": - colored_stdout = True - colored_stderr = True - elif args.color == "auto": - colored_stdout = sys.stdout.isatty() - colored_stderr = sys.stderr.isatty() - - version_invocation = [args.clang_format_executable, "--version"] - try: - subprocess.check_call(version_invocation, stdout=DEVNULL) - except subprocess.CalledProcessError as e: - print_trouble(parser.prog, str(e), 
use_colors=colored_stderr) - return ExitStatus.TROUBLE - except OSError as e: - print_trouble( - parser.prog, - f"Command '{subprocess.list2cmdline(version_invocation)}' failed to start: {e}", - use_colors=colored_stderr, - ) - return ExitStatus.TROUBLE - - retcode = ExitStatus.SUCCESS - files = list_files( - args.files, recursive=args.recursive, exclude=args.exclude, extensions=args.extensions.split(",") - ) - - if not files: - return - - njobs = args.j - if njobs == 0: - njobs = multiprocessing.cpu_count() + 1 - njobs = min(len(files), njobs) - - if njobs == 1: - # execute directly instead of in a pool, - # less overhead, simpler stacktraces - it = (run_clang_format_diff_wrapper(args, file) for file in files) - pool = None - else: - pool = multiprocessing.Pool(njobs) - it = pool.imap_unordered(partial(run_clang_format_diff_wrapper, args), files) - while True: - try: - outs, errs = next(it) - except StopIteration: - break - except DiffError as e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - retcode = ExitStatus.TROUBLE - sys.stderr.writelines(e.errs) - except UnexpectedError as e: - print_trouble(parser.prog, str(e), use_colors=colored_stderr) - sys.stderr.write(e.formatted_traceback) - retcode = ExitStatus.TROUBLE - # stop at the first unexpected error, - # something could be very wrong, - # don't process all files unnecessarily - if pool: - pool.terminate() - break - else: - sys.stderr.writelines(errs) - if outs == []: - continue - if not args.quiet: - print_diff(outs, use_color=colored_stdout) - if retcode == ExitStatus.SUCCESS: - retcode = ExitStatus.DIFF - return retcode - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.circleci/unittest/linux/scripts/run_test.sh deleted file mode 100755 index 8f6b8cb8485..00000000000 --- a/.circleci/unittest/linux/scripts/run_test.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh deleted file mode 100755 index 0574cdff1cf..00000000000 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# Avoid error: "fatal: unsafe repository" -git config --global --add safe.directory '*' -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" - bash ./miniconda.sh -b -f -p "${conda_dir}" -fi -eval "$(${conda_dir}/bin/conda shell.bash hook)" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. 
Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -FFMPEG_PIN="=4.2" -if [[ "${PYTHON_VERSION}" = "3.9" ]]; then - FFMPEG_PIN=">=4.2" -fi - -conda install -y -c pytorch "ffmpeg${FFMPEG_PIN}" -conda env update --file "${this_dir}/environment.yml" --prune diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml deleted file mode 100644 index 0e07ae80d0d..00000000000 --- a/.circleci/unittest/windows/scripts/environment.yml +++ /dev/null @@ -1,19 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - hdf5 - - setuptools - - pip: - - future - - scipy - - av != 9.1.1 - - dataclasses - - h5py diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh deleted file mode 100644 index cfdff3da6ba..00000000000 --- a/.circleci/unittest/windows/scripts/install.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. -# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. - -set -ex - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -# TODO, refactor the below logic to make it easy to understand how to get correct cuda_version. -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - - cuda_toolkit_pckg="cudatoolkit" - if [[ "$CU_VERSION" == cu116 ]]; then - cuda_toolkit_pckg="cuda" - fi - - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - cudatoolkit="${cuda_toolkit_pckg}=${version}" -fi - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" - -torch_cuda=$(python -c "import torch; print(torch.cuda.is_available())") -echo torch.cuda.is_available is $torch_cuda - -if [ ! 
-z "${CUDA_VERSION:-}" ] ; then - if [ "$torch_cuda" == "False" ]; then - echo "torch with cuda installed but torch.cuda.is_available() is False" - exit 1 - fi -fi - -source "$this_dir/set_cuda_envs.sh" - -printf "* Installing torchvision\n" -"$this_dir/vc_env_helper.bat" python setup.py develop diff --git a/.circleci/unittest/windows/scripts/install_conda.bat b/.circleci/unittest/windows/scripts/install_conda.bat deleted file mode 100644 index 6052ad08b10..00000000000 --- a/.circleci/unittest/windows/scripts/install_conda.bat +++ /dev/null @@ -1 +0,0 @@ -start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% diff --git a/.circleci/unittest/windows/scripts/post_process.sh b/.circleci/unittest/windows/scripts/post_process.sh deleted file mode 100644 index 5c5cbb758a9..00000000000 --- a/.circleci/unittest/windows/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env diff --git a/.circleci/unittest/windows/scripts/run_test.sh b/.circleci/unittest/windows/scripts/run_test.sh deleted file mode 100644 index 802ad37f511..00000000000 --- a/.circleci/unittest/windows/scripts/run_test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source "$this_dir/set_cuda_envs.sh" - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/.circleci/unittest/windows/scripts/set_cuda_envs.sh deleted file mode 100644 index d1ed415940d..00000000000 --- a/.circleci/unittest/windows/scripts/set_cuda_envs.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -set -ex - -echo CU_VERSION is "${CU_VERSION}" -echo CUDA_VERSION is "${CUDA_VERSION}" - -# Currenly, CU_VERSION and CUDA_VERSION are not consistent. -# to understand this code, see https://github.com/pytorch/vision/issues/4443 -version="cpu" -if [[ ! -z "${CUDA_VERSION}" ]] ; then - version="$CUDA_VERSION" -else - if [[ ${#CU_VERSION} -eq 5 ]]; then - version="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi -fi - -# Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi. -# It would exit the shell. One result is cpu tests would not run if the shell exit. -# Unless there's an error, Don't exit. -if [[ "$version" != "cpu" ]]; then - # set cuda envs - export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH" - export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - - if [ ! -d "$CUDA_PATH" ]; then - echo "$CUDA_PATH" does not exist - exit 1 - fi - - if [ ! 
-f "${CUDA_PATH}\include\nvjpeg.h" ]; then - echo "nvjpeg does not exist" - exit 1 - fi - - # check cuda driver version - for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do - if [[ -x "$path" ]]; then - "$path" || echo "true"; - break - fi - done - - which nvcc - nvcc --version - env | grep CUDA -fi diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh deleted file mode 100644 index 5eeb2e17b48..00000000000 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - export tmp_conda="$(echo $conda_dir | tr '/' '\\')" - export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" - curl --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O - "$this_dir/install_conda.bat" - unset tmp_conda - unset miniconda_exe -fi - -eval "$(${conda_dir}/Scripts/conda.exe 'shell.bash' 'hook')" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -conda env update --file "${this_dir}/environment.yml" --prune - -# 4. Downgrade setuptools on Python 3.7. -# See https://github.com/pytorch/vision/pull/5868 -if [[ "${PYTHON_VERSION}" == '3.7' ]]; then - pip install --upgrade setuptools==58.0.4 -fi diff --git a/.circleci/unittest/windows/scripts/vc_env_helper.bat b/.circleci/unittest/windows/scripts/vc_env_helper.bat deleted file mode 100644 index 9410135677a..00000000000 --- a/.circleci/unittest/windows/scripts/vc_env_helper.bat +++ /dev/null @@ -1,39 +0,0 @@ -@echo on - -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15INSTALLDIR=%%i" - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto vswhere - ) -) - -:vswhere -if "%VSDEVCMD_ARGS%" == "" ( - call "%VS15VCVARSALL%" x64 || exit /b 1 -) else ( - call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 -) - -@echo on - -set DISTUTILS_USE_SDK=1 - -set args=%1 -shift -:start -if [%1] == [] goto done -set args=%args% %1 -shift -goto start - -:done -if "%args%" == "" ( - echo Usage: vc_env_helper.bat [command] [args] - echo e.g. 
vc_env_helper.bat cl /c test.cpp -) - -%args% || exit /b 1 diff --git a/.clang-format b/.clang-format index 6d0ab740db4..95d60445f4a 100644 --- a/.clang-format +++ b/.clang-format @@ -60,9 +60,6 @@ MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 @@ -85,4 +82,11 @@ SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never +--- +Language: ObjC +ColumnLimit: 120 +AlignAfterOpenBracket: Align +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false ... diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index eec93854788..138adf1104e 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -7,3 +7,9 @@ d367a01a18a3ae6bee13d8be3b63fd6a581ea46f # Upgrade usort to 1.0.2 and black to 22.3.0 (#5106) 6ca9c76adb6daf2695d603ad623a9cf1c4f4806f +# Fix unnecessary exploded black formatting (#7709) +a335d916db0694770e8152f41e19195de3134523 +# Renaming: `BoundingBox` -> `BoundingBoxes` (#7778) +332bff937c6711666191880fab57fa2f23ae772e +# Upgrade type hint and others to Python 3.9 (#8814) +a095de183d3811d79ed0db2715e7a1c3162fa19d diff --git a/.gitattributes b/.gitattributes index f9d672d7fb5..22d0452f8d7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,3 @@ # To ignore it use below *.ipynb linguist-documentation - -# To exclude autogenerated files from code reviews -.circleci/config.yml linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index a073146ebed..ba811554c43 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -48,7 +48,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py # For security purposes, please check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 27d0f2a1f0b..1a3402466f4 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1 +1,10 @@ tracking_issue: 2447 + +# List of workflows that will be re-run in case of failures +# https://github.com/pytorch/test-infra/blob/main/torchci/lib/bot/retryBot.ts +retryable_workflows: +- Build Linux +- Build Macos +- Build M1 +- Build Windows +- Tests diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh new file mode 100755 index 00000000000..4217a9d24be --- /dev/null +++ b/.github/scripts/cmake.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +set +x && eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. 
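The activation idiom above, `eval "$($(which conda) shell.bash hook)"`, is needed because `conda activate` is a shell function rather than an executable, and it does not exist in a fresh non-interactive CI shell until the hook has been evaluated. A minimal reproduction (the `ci` environment name matches the one the setup scripts create):

```bash
# Without the eval line, `conda activate` fails with "Run 'conda init' ..." in a bare shell.
eval "$($(which conda) shell.bash hook)"   # defines the `conda` shell function for this shell
conda activate ci
conda info --envs                          # the active environment is marked with '*'
```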
+case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +if [[ $OS_TYPE == macos ]]; then + JOBS=$(sysctl -n hw.logicalcpu) +else + JOBS=$(nproc) +fi + +if [[ $OS_TYPE == linux ]]; then + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" +fi + +TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))") +if [[ $OS_TYPE == windows ]]; then + PACKAGING_DIR="${PWD}/packaging" + export PATH="${TORCH_PATH}/lib:${PATH}" +fi + +Torch_DIR="${TORCH_PATH}/share/cmake/Torch" +if [[ "${GPU_ARCH_TYPE}" == "cuda" ]]; then + WITH_CUDA=1 +else + WITH_CUDA=0 +fi + +echo '::group::Prepare CMake builds' +mkdir -p cpp_build + +pushd examples/cpp +python script_model.py +mkdir -p build +mv resnet18.pt fasterrcnn_resnet50_fpn.pt build +popd + +# This was only needed for the tracing above +pip uninstall -y torchvision +echo '::endgroup::' + +echo '::group::Build and install libtorchvision' +pushd cpp_build + + +# On macOS, CMake is looking for the library (*.dylib) and the header (*.h) separately. By default, it prefers to load +# the header from other packages that install the library. This easily leads to a mismatch if the library installed +# from conda doesn't have the exact same version. Thus, we need to explicitly set CMAKE_FIND_FRAMEWORK=NEVER to force +# it to not load anything from other installed frameworks. Resources: +# https://stackoverflow.com/questions/36523911/osx-homebrew-cmake-libpng-version-mismatch-issue +# https://cmake.org/cmake/help/latest/variable/CMAKE_FIND_FRAMEWORK.html +cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DCMAKE_INSTALL_PREFIX="${CONDA_PREFIX}" +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cmake.bat" $JOBS +else + make -j$JOBS + make install +fi + +popd +echo '::endgroup::' + +echo '::group::Build and run C++ example' +pushd examples/cpp/build + +cmake .. -DTorch_DIR="${Torch_DIR}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DUSE_TORCHVISION=ON # Needed for faster-rcnn since it's using torchvision ops like NMS. +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cpp_example.bat" $JOBS + cd Release + cp ../resnet18.pt . + cp ../fasterrcnn_resnet50_fpn.pt . +else + make -j$JOBS +fi + +./run_model resnet18.pt +./run_model fasterrcnn_resnet50_fpn.pt + +popd +echo '::endgroup::' diff --git a/.github/scripts/export_IS_M1_CONDA_BUILD_JOB.sh b/.github/scripts/export_IS_M1_CONDA_BUILD_JOB.sh new file mode 100755 index 00000000000..1cca56ddc56 --- /dev/null +++ b/.github/scripts/export_IS_M1_CONDA_BUILD_JOB.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +export IS_M1_CONDA_BUILD_JOB=1 diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh new file mode 100755 index 00000000000..e1c5855f31c --- /dev/null +++ b/.github/scripts/setup-env.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +# Prepare conda +set +x && eval "$($(which conda) shell.bash hook)" && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. 
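`Torch_DIR` has to point at the directory containing `TorchConfig.cmake`, which ships inside the installed `torch` package; that is why it is derived from `torch.__path__` rather than hard-coded. A quick standalone check, assuming `torch` is importable in the active environment:

```bash
# find_package(Torch) resolves through this config file.
TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))")
Torch_DIR="${TORCH_PATH}/share/cmake/Torch"
ls "${Torch_DIR}/TorchConfig.cmake"   # must exist for -DTorch_DIR="${Torch_DIR}" to work
```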
+case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +echo '::group::Create build environment' +# See https://github.com/pytorch/vision/issues/7296 for ffmpeg +conda create \ + --name ci \ + --quiet --yes \ + python="${PYTHON_VERSION}" pip \ + ninja cmake \ + libpng \ + libwebp \ + 'ffmpeg<4.3' +conda activate ci +conda install --quiet --yes libjpeg-turbo -c pytorch +pip install --progress-bar=off --upgrade setuptools==72.1.0 + +# See https://github.com/pytorch/vision/issues/6790 +if [[ "${PYTHON_VERSION}" != "3.11" ]]; then + pip install --progress-bar=off av!=10.0.0 +fi + +echo '::endgroup::' + +if [[ "${OS_TYPE}" == windows && "${GPU_ARCH_TYPE}" == cuda ]]; then + echo '::group::Install VisualStudio CUDA extensions on Windows' + TARGET_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/MSBuild/Microsoft/VC/v170/BuildCustomizations" + mkdir -p "${TARGET_DIR}" + cp -r "${CUDA_HOME}/MSBuildExtensions/"* "${TARGET_DIR}" + echo '::endgroup::' +fi + +echo '::group::Install PyTorch' +# TODO: Can we maybe have this as environment variable in the job template? For example, `IS_RELEASE`. +if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + CHANNEL=test +else + CHANNEL=nightly +fi + +case $GPU_ARCH_TYPE in + cpu) + GPU_ARCH_ID="cpu" + ;; + cuda) + VERSION_WITHOUT_DOT=$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//') + GPU_ARCH_ID="cu${VERSION_WITHOUT_DOT}" + ;; + *) + echo "Unknown GPU_ARCH_TYPE=${GPU_ARCH_TYPE}" + exit 1 + ;; +esac +PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + +if [[ $GPU_ARCH_TYPE == 'cuda' ]]; then + python -c "import torch; exit(not torch.cuda.is_available())" +fi +echo '::endgroup::' + +echo '::group::Install TorchVision' +pip install -e . 
-v --no-build-isolation +echo '::endgroup::' + +echo '::group::Install torchvision-extra-decoders' +# This can be done after torchvision was built +if [[ "$(uname)" == "Linux" && "$(uname -m)" != "aarch64" ]]; then + extra_decoders_channel="--pre --index-url https://download.pytorch.org/whl/nightly/cpu" +else + extra_decoders_channel="" +fi + +pip install torchvision-extra-decoders $extra_decoders_channel +echo '::endgroup::' + +echo '::group::Collect environment information' +conda list +python -m torch.utils.collect_env +echo '::endgroup::' diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh new file mode 100755 index 00000000000..43968762a8b --- /dev/null +++ b/.github/scripts/unittest.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -euo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci + +echo '::group::Install testing utilities' +# TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed +pip install --progress-bar=off "pytest<8" pytest-mock pytest-cov expecttest!=0.2.0 requests +echo '::endgroup::' + +python test/smoke_test.py + +# We explicitly ignore the video tests until we resolve https://github.com/pytorch/vision/issues/8162 +pytest --ignore-glob="*test_video*" --ignore-glob="*test_onnx*" --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 -k "not TestFxFeatureExtraction" diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml new file mode 100644 index 00000000000..13518aba924 --- /dev/null +++ b/.github/workflows/build-cmake.yml @@ -0,0 +1,87 @@ +name: CMake + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + linux: + strategy: + matrix: + include: + - runner: linux.12xlarge + gpu-arch-type: cpu + - runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + ./.github/scripts/cmake.sh + + macos: + strategy: + matrix: + include: + - runner: macos-m1-stable + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ${CONDA_RUN} ./.github/scripts/cmake.sh + + windows: + strategy: + matrix: + include: + - runner: windows.4xlarge + gpu-arch-type: cpu + - runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} 
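The `GPU_ARCH_TYPE`/`GPU_ARCH_VERSION` values exported here are what `setup-env.sh` turns into a PyTorch wheel index URL; a small illustration with representative inputs (the concrete values below are examples only):

```bash
# Mirrors the channel/arch -> index-URL mapping from .github/scripts/setup-env.sh.
CHANNEL=nightly              # becomes "test" on release branches and RC tags
GPU_ARCH_TYPE=cuda
GPU_ARCH_VERSION=12.6
if [[ "${GPU_ARCH_TYPE}" == cpu ]]; then
  GPU_ARCH_ID="cpu"
else
  GPU_ARCH_ID="cu$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//')"      # 12.6 -> cu126
fi
echo "https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}"   # .../whl/nightly/cu126
```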
+ export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/cmake.sh diff --git a/.github/workflows/build-m1-binaries.yml b/.github/workflows/build-m1-binaries.yml deleted file mode 100644 index b34dfd8f528..00000000000 --- a/.github/workflows/build-m1-binaries.yml +++ /dev/null @@ -1,157 +0,0 @@ -name: Build on M1 -on: - pull_request: - paths: - - .github/workflows/build-m1-binaries.yml - push: - branches: - - nightly - - main - - release/* - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - build_wheels: - name: "Build TorchVision M1 wheels" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Build TorchVision M1 wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - set -ex - . packaging/pkg_helpers.bash - # if we are uploading to test channell, our version consist only of the base: 0.x.x - no date string or suffix added - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg wheel pkg-config - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install delocate - conda run -p ${ENV_NAME} python3 setup.py bdist_wheel - export PYTORCH_VERSION="$(conda run -p ${ENV_NAME} python3 -mpip show torch | grep ^Version: | sed 's/Version: *//')" - conda run -p ${ENV_NAME} DYLD_FALLBACK_LIBRARY_PATH="${ENV_NAME}/lib" delocate-wheel -v --ignore-missing-dependencies dist/*.whl - conda env remove -p ${ENV_NAME} - - name: Test wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-test-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install dist/*.whl - # Test torch is importable, by changing cwd and running import commands - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print('torchvision version is ', torchvision.__version__)" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torch;import torchvision;print('Is torchvision useable?', all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]))" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print(torchvision.io.read_image('${PWD}/gallery/assets/dog1.jpg').shape)" - conda env remove -p ${ENV_NAME} - - name: Upload wheel 
to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1 - path: dist/ - - name: Upload wheel to S3 - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - run: | - for pkg in dist/*; do - aws s3 cp "$pkg" "s3://pytorch/whl/${CHANNEL}/cpu/" --acl public-read - done - build_conda: - name: "Build TorchVision M1 conda packages" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set CHANNEL Release (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Install conda-build and purge previous artifacts - shell: arch -arch arm64 bash {0} - run: | - conda install -yq conda-build - conda build purge-all - - - name: Build TorchVision M1 conda package - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PYTHON_VERSION: ${{ matrix.py_vers }} - CU_VERSION: cpu - run: | - set -ex - . packaging/pkg_helpers.bash - - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - setup_conda_pytorch_constraint - export SOURCE_ROOT_DIR=$(pwd) - conda build \ - -c defaults \ - $CONDA_CHANNEL_FLAGS \ - --no-anaconda-upload \ - --python "$PYTHON_VERSION" \ - --output-folder=dist/ \ - packaging/torchvision - - name: Upload package to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1-conda - path: dist/ - - name: Upload package to conda - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - conda install -yq anaconda-client - set -x - export ANACONDA_PATH=$(conda info --base)/bin - $ANACONDA_PATH/anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload dist/osx-arm64/*.tar.bz2 -u "pytorch-${CHANNEL}" --label main --no-progress --force diff --git a/.github/workflows/build-wheels-aarch64-linux.yml b/.github/workflows/build-wheels-aarch64-linux.yml new file mode 100644 index 00000000000..89948db6397 --- /dev/null +++ b/.github/workflows/build-wheels-aarch64-linux.yml @@ -0,0 +1,54 @@ +name: Build Aarch64 Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux-aarch64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + 
with-cuda: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: aarch64 + setup-miniconda: false diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml new file mode 100644 index 00000000000..818f32c102b --- /dev/null +++ b/.github/workflows/build-wheels-linux.yml @@ -0,0 +1,52 @@ +name: Build Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-xpu: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml new file mode 100644 index 00000000000..76709b755e8 --- /dev/null +++ b/.github/workflows/build-wheels-m1.yml @@ -0,0 +1,52 @@ +name: Build M1 Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ 
matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-m1-stable + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build-wheels-windows.yml b/.github/workflows/build-wheels-windows.yml new file mode 100644 index 00000000000..a269aea2604 --- /dev/null +++ b/.github/workflows/build-wheels-windows.yml @@ -0,0 +1,54 @@ +name: Build Windows Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: windows + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-xpu: enable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + env-script: packaging/windows/internal/vc_env_helper.bat + post-script: "python packaging/wheel/relocate.py" + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@main + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + env-script: ${{ matrix.env-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} diff --git a/.github/workflows/build_wheel_windows_arm64.yml b/.github/workflows/build_wheel_windows_arm64.yml new file mode 100644 index 00000000000..0c578b194ab --- /dev/null +++ b/.github/workflows/build_wheel_windows_arm64.yml @@ -0,0 +1,54 @@ +name: Build Windows ARM64 Wheels + +on: + pull_request: + paths: + - .github/workflows/build_wheel_windows_arm64.yml + push: + branches: + - nightly + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +permissions: + id-token: write + contents: read + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main + with: + package-type: wheel + os: windows-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: main + with-cuda: disable + + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + smoke-test-script: test/smoke_test.py + pre-script: packaging/pre_build_script_arm64.sh + package-name: torchvision + architecture: "arm64" + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@main + with: + repository: 
${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: main + pre-script: ${{ matrix.pre-script }} + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: ${{ matrix.architecture }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 00000000000..8b341622181 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,133 @@ +name: Docs + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + - v[0-9]+.[0-9]+.[0-9] + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + build: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + upload-artifact: docs + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.10 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it + # already links against the one pulled from conda. However, at runtime it pulls from + # /lib64 + # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't + # have to pay attention in all other workflows? + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + + cd docs + + echo '::group::Install doc requirements' + pip install --progress-bar=off -r requirements.txt + echo '::endgroup::' + + if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then + echo '::group::Enable version string sanitization' + # This environment variable just has to exist and must not be empty. The actual value is arbitrary. + # See docs/source/conf.py for details + export TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS=1 + echo '::endgroup::' + fi + + # The runner does not have sufficient memory to run with as many processes as there are + # cores (`-j auto`). Thus, we limit to a single process (`-j 1`) here. + sed -i -e 's/-j auto/-j 1/' Makefile + make html + + # Below is an imperfect way for us to add "try on Colab" links to all of our gallery examples. + # sphinx-gallery will convert all gallery examples to .ipynb notebooks and stores them in + # build/html/_downloads//.ipynb + # We copy all those ipynb files in a more convenient folder so that we can more easily link to them. + mkdir build/html/_generated_ipynb_notebooks + for file in `find build/html/_downloads`; do + if [[ $file == *.ipynb ]]; then + cp $file build/html/_generated_ipynb_notebooks/ + fi + done + + cp -r build/html "${RUNNER_ARTIFACT_DIR}" + + # On PRs we also want to upload the docs into our S3 bucket for preview. 
+ if [[ ${{ github.event_name == 'pull_request' }} ]]; then + cp -r build/html/* "${RUNNER_DOCS_DIR}" + fi + + upload: + needs: build + if: github.repository == 'pytorch/vision' && github.event_name == 'push' && + ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: write + with: + repository: pytorch/vision + download-artifact: docs + ref: gh-pages + test-infra-ref: main + script: | + set -euo pipefail + + REF_TYPE=${{ github.ref_type }} + REF_NAME=${{ github.ref_name }} + + if [[ "${REF_TYPE}" == branch ]]; then + TARGET_FOLDER="${REF_NAME}" + elif [[ "${REF_TYPE}" == tag ]]; then + case "${REF_NAME}" in + *-rc*) + echo "Aborting upload since this is an RC tag: ${REF_NAME}" + exit 0 + ;; + *) + # Strip the leading "v" as well as the trailing patch version. For example: + # 'v0.15.2' -> '0.15' + TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/') + ;; + esac + fi + echo "Target Folder: ${TARGET_FOLDER}" + + mkdir -p "${TARGET_FOLDER}" + rm -rf "${TARGET_FOLDER}"/* + mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" + git add "${TARGET_FOLDER}" || true + + if [[ "${TARGET_FOLDER}" == main ]]; then + mkdir -p _static + rm -rf _static/* + cp -r "${TARGET_FOLDER}"/_static/* _static + git add _static || true + fi + + git config user.name 'pytorchbot' + git config user.email 'soumith+bot@pytorch.org' + git config http.postBuffer 524288000 + git commit -m "auto-generating sphinx docs" || true + git push diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000000..deb3cdff83d --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,81 @@ +name: Lint + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + python-source-and-configs: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda create --name ci --quiet --yes python=3.9 pip + conda activate ci + echo '::endgroup::' + + echo '::group::Install lint tools' + pip install --progress-bar=off pre-commit + echo '::endgroup::' + + set +e + pre-commit run --all-files + + if [ $? 
-ne 0 ]; then + git --no-pager diff + exit 1 + fi + + python-types: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.11 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install lint tools' + pip install --progress-bar=off "mypy==1.13.0" + echo '::endgroup::' + + echo '::group::Lint Python types' + mypy --install-types --non-interactive --config-file mypy.ini + echo '::endgroup::' + + # bc: + # if: github.event.pull_request + # runs-on: ubuntu-latest + # steps: + # - name: Run BC Lint Action + # uses: pytorch/test-infra/.github/actions/bc-lint@main + # with: + # repo: ${{ github.event.pull_request.head.repo.full_name }} + # base_sha: ${{ github.event.pull_request.base.sha }} + # head_sha: ${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/pr-labels.yml b/.github/workflows/pr-labels.yml index 20c37e4fd88..bf6349ab02e 100644 --- a/.github/workflows/pr-labels.yml +++ b/.github/workflows/pr-labels.yml @@ -8,28 +8,33 @@ on: jobs: is-properly-labeled: runs-on: ubuntu-latest + permissions: + pull-requests: write steps: - name: Set up python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 - name: Install requests run: pip install requests - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Process commit and find merger responsible for labeling id: commit - run: echo "::set-output name=merger::$(python .github/process_commit.py ${{ github.sha }})" + run: | + MERGER=$(python .github/process_commit.py ${{ github.sha }}) + echo "merger=${MERGER}" | tee --append $GITHUB_OUTPUT - name: Ping merger responsible for labeling if necessary if: ${{ steps.commit.outputs.merger != '' }} - uses: mshick/add-pr-comment@v1 + uses: mshick/add-pr-comment@v2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: message: | Hey ${{ steps.commit.outputs.merger }}! - You merged this PR, but no labels were added. The list of valid labels is available at https://github.com/pytorch/vision/blob/main/.github/process_commit.py + You merged this PR, but no labels were added. + The list of valid labels is available at https://github.com/pytorch/vision/blob/main/.github/process_commit.py diff --git a/.github/workflows/prototype-tests-linux-gpu.yml b/.github/workflows/prototype-tests-linux-gpu.yml new file mode 100644 index 00000000000..18881bfacb2 --- /dev/null +++ b/.github/workflows/prototype-tests-linux-gpu.yml @@ -0,0 +1,60 @@ +name: Prototype tests on Linux + +# IMPORTANT: This workflow has been manually disabled from the GitHub interface +# in June 2024. The file is kept for reference in case we ever put this back. 
+ +on: + pull_request: + +jobs: + unittests-prototype: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: "3.9" + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest pytest-mock pytest-cov + echo '::endgroup::' + + # We don't want to run the prototype datasets tests. Since the positional glob into `pytest`, i.e. + # `test/test_prototype*.py` takes the highest priority, neither `--ignore` nor `--ignore-glob` can help us here. + rm test/test_prototype_datasets*.py + pytest \ + -v --durations=25 \ + --cov=torchvision/prototype --cov-report=term-missing \ + --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" \ + test/test_prototype_*.py diff --git a/.github/workflows/prototype-tests.yml b/.github/workflows/prototype-tests.yml deleted file mode 100644 index ff29168d9a7..00000000000 --- a/.github/workflows/prototype-tests.yml +++ /dev/null @@ -1,44 +0,0 @@ -name: tests - -on: - pull_request: - -jobs: - prototype: - strategy: - matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest - fail-fast: false - - runs-on: ${{ matrix.os }} - - steps: - - name: Set up python - uses: actions/setup-python@v3 - with: - python-version: 3.7 - - - name: Upgrade system packages - run: python -m pip install --upgrade pip setuptools wheel - - - name: Checkout repository - uses: actions/checkout@v3 - - - name: Install PyTorch nightly builds - run: pip install --progress-bar=off --pre torch torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu/ - - - name: Install torchvision - run: pip install --progress-bar=off --no-build-isolation --editable . 
- - - name: Install other prototype dependencies - run: pip install --progress-bar=off scipy pycocotools h5py iopath - - - name: Install test requirements - run: pip install --progress-bar=off pytest pytest-mock - - - name: Run prototype tests - shell: bash - run: pytest -vvv --durations=20 test/test_prototype_*.py diff --git a/.github/workflows/test-m1.yml b/.github/workflows/test-m1.yml deleted file mode 100644 index 1e5f79f82fd..00000000000 --- a/.github/workflows/test-m1.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Unit-tests on M1 -on: - pull_request: - push: - branches: - - nightly - - main - - release/* - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - tests: - name: "Unit-tests on M1" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8"] - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Install TorchVision - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . ~/miniconda3/etc/profile.d/conda.sh - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - export PATH=~/miniconda3/bin:$PATH - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy - conda run -p ${ENV_NAME} python3 -mpip install --pre torch --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 setup.py develop - conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock av - - name: Run tests - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . ~/miniconda3/etc/profile.d/conda.sh - set -ex - conda run -p ${ENV_NAME} --no-capture-output python3 -u -mpytest -v --tb=long --durations 20 - conda env remove -p ${ENV_NAME} diff --git a/.github/workflows/tests-schedule.yml b/.github/workflows/tests-schedule.yml index ecc283cac27..3cba2ef59d8 100644 --- a/.github/workflows/tests-schedule.yml +++ b/.github/workflows/tests-schedule.yml @@ -18,14 +18,20 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.9 - name: Upgrade system packages run: python -m pip install --upgrade pip setuptools wheel + - name: SSL + run: python -c 'import ssl; print(ssl.OPENSSL_VERSION)' + - name: Checkout repository uses: actions/checkout@v2 + - name: TODO REMOVE THIS! Install non pre-release version of mpmath. + run: pip install "mpmath<1.4" + - name: Install torch nightly build run: pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html @@ -33,7 +39,7 @@ jobs: run: pip install --no-build-isolation --editable . 
- name: Install all optional dataset requirements - run: pip install scipy pycocotools lmdb requests + run: pip install scipy pycocotools lmdb gdown - name: Install tests requirements run: pip install pytest diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 00000000000..5e0ed381b01 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,184 @@ +name: Tests + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + unittests-linux: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: 3.9 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + unittests-macos: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + # TODO put back 3.11 (See blame) + # - "3.11" + - "3.12" + runner: ["macos-m1-stable"] + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@main + with: + repository: pytorch/vision + timeout: 240 + runner: ${{ matrix.runner }} + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ${CONDA_RUN} ./.github/scripts/unittest.sh + + unittests-windows: + strategy: + matrix: + python-version: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + runner: ["windows.4xlarge"] + gpu-arch-type: ["cpu"] + # TODO: put GPU testing back + # include: + # - python-version: "3.9" + # runner: windows.g5.4xlarge.nvidia.gpu + # gpu-arch-type: cuda + # gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + permissions: + id-token: write + contents: read + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + test-infra-ref: main + script: | + set -euxo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + # onnx: + # uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + # permissions: + # id-token: write + # contents: read + # with: + # repository: pytorch/vision + # test-infra-ref: main + # script: | + # set -euo pipefail + + # export PYTHON_VERSION=3.10 + # export GPU_ARCH_TYPE=cpu + # export GPU_ARCH_VERSION='' + + # ./.github/scripts/setup-env.sh + + # # Prepare conda + # CONDA_PATH=$(which conda) + # eval "$(${CONDA_PATH} shell.bash hook)" + # conda activate ci + + # echo '::group::Install ONNX' + # pip install --progress-bar=off onnx onnxruntime + # echo '::endgroup::' + + # echo '::group::Install testing utilities' + # pip install --progress-bar=off pytest "numpy<2" 
+ # echo '::endgroup::' + + # echo '::group::Run ONNX tests' + # pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py + # echo '::endgroup::' + + unittests-extended: + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + if: contains(github.event.pull_request.labels.*.name, 'run-extended') + with: + repository: pytorch/vision + test-infra-ref: main + script: | + set -euo pipefail + + export PYTHON_VERSION=3.9 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Pre-download model weights' + pip install --progress-bar=off aiohttp aiofiles tqdm + python scripts/download_model_urls.py + echo '::endgroup::' + + echo '::group::Install testing utilities' + # TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed + pip install --progress-bar=off "pytest<8" + echo '::endgroup::' + + echo '::group::Run extended unittests' + export PYTORCH_TEST_WITH_EXTENDED=1 + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py + echo '::endgroup::' diff --git a/.gitignore b/.gitignore index f16b54061e0..c2d4d2a1c42 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ docs/source/auto_examples/ docs/source/gen_modules/ docs/source/generated/ docs/source/models/generated/ +docs/source/sg_execution_times.rst # pytorch-sphinx-theme gets installed here docs/src @@ -42,3 +43,5 @@ xcuserdata/ # direnv .direnv .envrc + +scripts/release_notes/data.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 463a97359ab..73a0b329112 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,20 +6,22 @@ repos: - id: check-toml - id: check-yaml exclude: packaging/.* + args: + - --allow-multiple-documents - id: mixed-line-ending args: [--fix=lf] - id: end-of-file-fixer - repo: https://github.com/omnilib/ufmt - rev: v1.3.2 + rev: v1.3.3 hooks: - id: ufmt additional_dependencies: - black == 22.3.0 - usort == 1.0.2 - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 hooks: - id: flake8 args: [--config=setup.cfg] @@ -28,3 +30,12 @@ repos: rev: 6.1.1 hooks: - id: pydocstyle + + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v18.1.3 + hooks: + - id: clang-format + name: clang-format + files: \.(cpp|hpp|c|h|cu)$ + types: [file] + exclude: ^torchvision/csrc/io/image/cpu/giflib/ diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000000..37db28b2bad --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +cff-version: 1.2.0 +title: "TorchVision: PyTorch's Computer Vision library" +message: >- + If you find TorchVision useful in your work, please + consider citing the following BibTeX entry. 
+type: software +authors: + - given-names: TorchVision maintainers and contributors +url: "https://github.com/pytorch/vision" +license: "BSD-3-Clause" +date-released: "2016-11-06" +journal: "GitHub repository" +publisher: "GitHub" +key: "torchvision2016" diff --git a/CMakeLists.txt b/CMakeLists.txt index 85b878307cf..f2430559909 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,23 +1,29 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.18) project(torchvision) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) file(STRINGS version.txt TORCHVISION_VERSION) option(WITH_CUDA "Enable CUDA support" OFF) +option(WITH_MPS "Enable MPS support" OFF) option(WITH_PNG "Enable features requiring LibPNG." ON) option(WITH_JPEG "Enable features requiring LibJPEG." ON) -option(USE_PYTHON "Link to Python when building" OFF) +# Libwebp is disabled by default, which means enabling it from cmake is largely +# untested. Since building from cmake is very low pri anyway, this is OK. If +# you're a user and you need this, please open an issue (and a PR!). +option(WITH_WEBP "Enable features requiring LibWEBP." OFF) +# Same here +option(WITH_AVIF "Enable features requiring LibAVIF." OFF) if(WITH_CUDA) enable_language(CUDA) add_definitions(-D__CUDA_NO_HALF_OPERATORS__) add_definitions(-DWITH_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") - # CUDA-11.x can not be compiled using C++14 standard on Windows - string(REGEX MATCH "^[0-9]+" CUDA_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - if(${CUDA_MAJOR} GREATER 10 AND MSVC) - set(CMAKE_CXX_STANDARD 17) - endif() +endif() + +if(WITH_MPS) + enable_language(OBJC OBJCXX) + add_definitions(-DWITH_MPS) endif() find_package(Torch REQUIRED) @@ -32,9 +38,14 @@ if (WITH_JPEG) find_package(JPEG REQUIRED) endif() -if (USE_PYTHON) - add_definitions(-DUSE_PYTHON) - find_package(Python3 REQUIRED COMPONENTS Development) +if (WITH_WEBP) + add_definitions(-DWEBP_FOUND) + find_package(WEBP REQUIRED) +endif() + +if (WITH_AVIF) + add_definitions(-DAVIF_FOUND) + find_package(AVIF REQUIRED) endif() function(CUDA_CONVERT_FLAGS EXISTING_TARGET) @@ -79,11 +90,14 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) set(TVCPP torchvision/csrc) -list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCPP}/models ${TVCPP}/ops +list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCPP}/io/image/cpu/giflib ${TVCPP}/models ${TVCPP}/ops ${TVCPP}/ops/autograd ${TVCPP}/ops/cpu ${TVCPP}/io/image/cuda) if(WITH_CUDA) list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast) endif() +if(WITH_MPS) + list(APPEND ALLOW_LISTED ${TVCPP}/ops/mps) +endif() FOREACH(DIR ${ALLOW_LISTED}) file(GLOB ALL_SOURCES ${ALL_SOURCES} ${DIR}/*.*) @@ -92,6 +106,12 @@ ENDFOREACH() add_library(${PROJECT_NAME} SHARED ${ALL_SOURCES}) target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES}) +if(WITH_MPS) + find_library(metal NAMES Metal) + find_library(foundation NAMES Foundation) + target_link_libraries(${PROJECT_NAME} PRIVATE ${metal} ${foundation}) +endif() + if (WITH_PNG) target_link_libraries(${PROJECT_NAME} PRIVATE ${PNG_LIBRARY}) endif() @@ -100,8 +120,12 @@ if (WITH_JPEG) target_link_libraries(${PROJECT_NAME} PRIVATE ${JPEG_LIBRARIES}) endif() -if (USE_PYTHON) - target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python) +if (WITH_WEBP) + target_link_libraries(${PROJECT_NAME} PRIVATE ${WEBP_LIBRARIES}) +endif() + +if (WITH_AVIF) + target_link_libraries(${PROJECT_NAME} PRIVATE ${AVIF_LIBRARIES}) 
endif() set_target_properties(${PROJECT_NAME} PROPERTIES @@ -118,6 +142,14 @@ if (WITH_JPEG) include_directories(${JPEG_INCLUDE_DIRS}) endif() +if (WITH_WEBP) + include_directories(${WEBP_INCLUDE_DIRS}) +endif() + +if (WITH_AVIF) + include_directories(${AVIF_INCLUDE_DIRS}) +endif() + set(TORCHVISION_CMAKECONFIG_INSTALL_DIR "share/cmake/TorchVision" CACHE STRING "install path for TorchVisionConfig.cmake") configure_package_config_file(cmake/TorchVisionConfig.cmake.in diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3eedb6261a4..d3cc923e268 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,22 +4,22 @@ We want to make contributing to this project as easy and transparent as possible ## TL;DR -We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. +We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. Your contributions may fall into the following categories: -- It helps the project if you could +- It helps the project if you could - Report issues you're facing - - Give a :+1: on issues that others reported and that are relevant to you + - Give a :+1: on issues that others reported and that are relevant to you - Answering queries on the issue tracker, investigating bugs are very valuable contributions to the project. -- You would like to improve the documentation. This is no less important than improving the library itself! +- You would like to improve the documentation. This is no less important than improving the library itself! If you find a typo in the documentation, do not hesitate to submit a GitHub pull request. - If you would like to fix a bug - please pick one from the [list of open issues labelled as "help wanted"](https://github.com/pytorch/vision/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) - comment on the issue that you want to work on this issue - - send a PR with your fix, see below. + - send a PR with your fix, see below. - If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us. @@ -30,30 +30,51 @@ clear and has sufficient instructions to be able to reproduce the issue. ## Development installation -### Install PyTorch Nightly + +### Dependencies + +Start by installing the **nightly** build of PyTorch following the [official +instructions](https://pytorch.org/get-started/locally/). Note that the official +instructions may ask you to install torchvision itself. If you are doing development +on torchvision, you should not install prebuilt torchvision packages. + +**Optionally**, install `libpng`, `libjpeg-turbo` and `libwebp` if you want to enable +support for +native encoding / decoding of PNG, JPEG and WebP formats in +[torchvision.io](https://pytorch.org/vision/stable/io.html#image): ```bash -conda install pytorch -c pytorch-nightly -# or with pip (see https://pytorch.org/get-started/locally/) -# pip install numpy -# pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html +conda install libpng libjpeg-turbo libwebp -c pytorch ``` -### Install Torchvision +Note: you can use the `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` +environment variables to tell the build system where to find those libraries if +they are in specific locations. Take a look at +[setup.py](https://github.com/pytorch/vision/blob/main/setup.py) for more +details. 
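A rough sketch of how those environment variables might be used on Linux before building: the `/opt/...` prefixes are placeholders, and the exact path-list parsing is defined in `setup.py` (the snippet assumes entries are separated by the platform path separator, `:` on Linux).

```bash
# Hypothetical layout: libjpeg-turbo and libpng installed under /opt (placeholders).
export TORCHVISION_INCLUDE="/opt/libjpeg-turbo/include:/opt/libpng/include"
export TORCHVISION_LIBRARY="/opt/libjpeg-turbo/lib:/opt/libpng/lib"
# Build torchvision against the libraries found above.
pip install -e . -v --no-build-isolation
```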
+ +### Clone and install torchvision ```bash git clone https://github.com/pytorch/vision.git cd vision -python setup.py develop +pip install -e . -v --no-build-isolation # leave out the -e switch if you don't care about development. # or, for OSX -# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py develop -# for C++ debugging, please use DEBUG=1 -# DEBUG=1 python setup.py develop -pip install flake8 typing mypy pytest pytest-mock scipy +# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ pip install -e . -v --no-build-isolation +# for C++ debugging, use DEBUG=1 +# DEBUG=1 pip install -e . -v --no-build-isolation ``` -You may also have to install `libpng-dev` and `libjpeg-turbo8-dev` libraries: -```bash -conda install libpng jpeg + +By default, GPU support is built if CUDA is found and `torch.cuda.is_available()` is true. It's possible to force +building GPU support by setting `FORCE_CUDA=1` environment variable, which is useful when building a docker image. + +We don't officially support building from source using `pip`, but _if_ you do, you'll need to use the +`--no-build-isolation` flag. + +#### Other development dependencies (some of these are needed to run tests): + +``` +pip install expecttest flake8 typing mypy pytest pytest-mock scipy requests ``` ## Development Process @@ -66,12 +87,12 @@ If you plan to modify the code or documentation, please follow the steps below: 4. Ensure the test suite passes. 5. Make sure your code passes the formatting checks (see below). -For more details about pull requests, -please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). +For more details about pull requests, +please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). -If you would like to contribute a new model, please see [here](#New-model). +If you would like to contribute a new model, please see [here](#New-architecture-or-improved-model-weights). -If you would like to contribute a new dataset, please see [here](#New-dataset). +If you would like to contribute a new dataset, please see [here](#New-dataset). ### Code formatting and typing @@ -83,7 +104,7 @@ Instead of relying directly on `black` however, we rely on [ufmt](https://github.com/omnilib/ufmt), for compatibility reasons with Facebook internal infrastructure. -To format your code, install `ufmt` with `pip install ufmt==1.3.2 black==22.3.0 usort==1.0.2` and use e.g.: +To format your code, install `ufmt` with `pip install ufmt==1.3.3 black==22.3.0 usort==1.0.2` and use e.g.: ```bash ufmt format torchvision @@ -126,8 +147,10 @@ mypy --config-file mypy.ini ### Unit tests -If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific -test: +Before running tests make sure to install [test dependencies](#other-development-dependencies-some-of-these-are-needed-to-run-tests). + +If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific +test: ```bash pytest test/ -vvv -k # e.g. pytest test/test_transforms.py -vvv -k test_center_crop @@ -136,7 +159,7 @@ pytest test/ -vvv -k If you would like to run all tests: ```bash pytest test -vvv -``` +``` Tests that require internet access should be in `test/test_internet.py`. @@ -189,21 +212,23 @@ with "transforms" in their name. 
### New architecture or improved model weights Please refer to the guidelines in [Contributing to Torchvision - Models](https://github.com/pytorch/vision/blob/main/CONTRIBUTING_MODELS.md). - + ### New dataset -More details on how to add a new dataset will be provided later. Please, do not send any PR with a new dataset without discussing +Please, do not send any PR with a new dataset without discussing it in an issue as, most likely, it will not be accepted. ### Pull Request -If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on -different operation systems, python versions and hardwares. +If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on +different operating systems, python versions and hardware. -For more details about pull requests workflow, +For more details about pull requests workflow, please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). ## License By contributing to Torchvision, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. + +Contributors are also required to [sign our Contributor License Agreement](https://code.facebook.com/cla). diff --git a/CONTRIBUTING_MODELS.md b/CONTRIBUTING_MODELS.md index 82845e6579a..390a25a0f89 100644 --- a/CONTRIBUTING_MODELS.md +++ b/CONTRIBUTING_MODELS.md @@ -20,13 +20,13 @@ So, before starting any work and submitting a PR there are a few critical things ### 1. Preparation work -- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether or not this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. +- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. - If the decision is to include the new model, then please create a new ticket which will be used for all design and implementation discussions prior to the PR. One of the TorchVision maintainers will reach out at this stage and this will be your POC from this point onwards in order to provide support, guidance and regular feedback. ### 2. Implement the model -Please take a look at existing models in TorchVision to get familiar with the idioms. Also please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. Example of things to take into account: +Please take a look at existing models in TorchVision to get familiar with the idioms. Also, please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. 
Example of things to take into account: - The implementation should be as close as possible to the canonical implementation/paper - The PR must include the code implementation, documentation and tests @@ -34,7 +34,7 @@ Please take a look at existing models in TorchVision to get familiar with the id - The weights need to reproduce closely the results of the paper in terms of accuracy, even though the final weights to be deployed will be those trained by the TorchVision maintainers - The PR description should include commands/configuration used to train the model, so that the TorchVision maintainers can easily run them to verify the implementation and generate the final model to be released - Make sure we re-use existing components as much as possible (inheritance) -- New primitives (transforms, losses, etc) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer +- New primitives (transforms, losses, etc.) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer - Please take a look at the detailed [implementation and documentation guidelines](https://github.com/pytorch/vision/issues/5319) for a fine grain list of things not to be missed ### 3. Train the model with reference scripts diff --git a/MANIFEST.in b/MANIFEST.in index 75f238c0a2c..9e45188df35 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include README.rst +include README.md include LICENSE recursive-exclude * __pycache__ diff --git a/README.md b/README.md new file mode 100644 index 00000000000..7846c4462e6 --- /dev/null +++ b/README.md @@ -0,0 +1,118 @@ +# torchvision + +[![total torchvision downloads](https://pepy.tech/badge/torchvision)](https://pepy.tech/project/torchvision) +[![documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/vision/stable/index.html) + +The torchvision package consists of popular datasets, model architectures, and common image transformations for computer +vision. + +## Installation + +Please refer to the [official +instructions](https://pytorch.org/get-started/locally/) to install the stable +versions of `torch` and `torchvision` on your system. + +To build source, refer to our [contributing +page](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation). + +The following is the corresponding `torchvision` versions and supported Python +versions. + +| `torch` | `torchvision` | Python | +| ------------------ | ------------------ | ------------------- | +| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.12` | +| `2.8` | `0.23` | `>=3.9`, `<=3.13` | +| `2.7` | `0.22` | `>=3.9`, `<=3.13` | +| `2.6` | `0.21` | `>=3.9`, `<=3.12` | + +
+ older versions + +| `torch` | `torchvision` | Python | +|---------|-------------------|---------------------------| +| `2.5` | `0.20` | `>=3.9`, `<=3.12` | +| `2.4` | `0.19` | `>=3.8`, `<=3.12` | +| `2.3` | `0.18` | `>=3.8`, `<=3.12` | +| `2.2` | `0.17` | `>=3.8`, `<=3.11` | +| `2.1` | `0.16` | `>=3.8`, `<=3.11` | +| `2.0` | `0.15` | `>=3.8`, `<=3.11` | +| `1.13` | `0.14` | `>=3.7.2`, `<=3.10` | +| `1.12` | `0.13` | `>=3.7`, `<=3.10` | +| `1.11` | `0.12` | `>=3.7`, `<=3.10` | +| `1.10` | `0.11` | `>=3.6`, `<=3.9` | +| `1.9` | `0.10` | `>=3.6`, `<=3.9` | +| `1.8` | `0.9` | `>=3.6`, `<=3.9` | +| `1.7` | `0.8` | `>=3.6`, `<=3.9` | +| `1.6` | `0.7` | `>=3.6`, `<=3.8` | +| `1.5` | `0.6` | `>=3.5`, `<=3.8` | +| `1.4` | `0.5` | `==2.7`, `>=3.5`, `<=3.8` | +| `1.3` | `0.4.2` / `0.4.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.2` | `0.4.1` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.1` | `0.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `<=1.0` | `0.2` | `==2.7`, `>=3.5`, `<=3.7` | + +
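A minimal sketch for checking which `torch` / `torchvision` pair is installed locally, to compare against the table above (adjust the interpreter name if needed):

```bash
# Print the installed torch and torchvision versions.
python -c "import torch, torchvision; print(torch.__version__, torchvision.__version__)"
```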
+ +## Image Backends + +Torchvision currently supports the following image backends: + +- torch tensors +- PIL images: + - [Pillow](https://python-pillow.org/) + - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) - a **much faster** drop-in replacement for Pillow with SIMD. + +Read more in in our [docs](https://pytorch.org/vision/stable/transforms.html). + +# Using the models on C++ + +Refer to [example/cpp](https://github.com/pytorch/vision/tree/main/examples/cpp). + +**DISCLAIMER**: the `libtorchvision` library includes the torchvision +custom ops as well as most of the C++ torchvision APIs. Those APIs do not come +with any backward-compatibility guarantees and may change from one version to +the next. Only the Python APIs are stable and with backward-compatibility +guarantees. So, if you need stability within a C++ environment, your best bet is +to export the Python APIs via torchscript. + +## Documentation + +You can find the API documentation on the pytorch website: + +## Contributing + +See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. + +## Disclaimer on Datasets + +This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, +vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to +determine whether you have permission to use the dataset under the dataset's license. + +If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset +to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML +community! + +## Pre-trained Model License + +The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the +dataset used for training. It is your responsibility to determine whether you have permission to use the models for your +use case. + +More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See +[SWAG LICENSE](https://github.com/facebookresearch/SWAG/blob/main/LICENSE) for additional details. + +## Citing TorchVision + +If you find TorchVision useful in your work, please consider citing the following BibTeX entry: + +```bibtex +@software{torchvision2016, + title = {TorchVision: PyTorch's Computer Vision library}, + author = {TorchVision maintainers and contributors}, + year = 2016, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/pytorch/vision}} +} +``` diff --git a/README.rst b/README.rst deleted file mode 100644 index c3605cc3c95..00000000000 --- a/README.rst +++ /dev/null @@ -1,198 +0,0 @@ -torchvision -=========== - -.. image:: https://pepy.tech/badge/torchvision - :target: https://pepy.tech/project/torchvision - -.. image:: https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v - :target: https://pytorch.org/vision/stable/index.html - - -The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. - - -Installation -============ - -We recommend Anaconda as Python package management system. Please refer to `pytorch.org `_ -for the detail of PyTorch (``torch``) installation. The following is the corresponding ``torchvision`` versions and -supported Python versions. 
- -+--------------------------+--------------------------+---------------------------------+ -| ``torch`` | ``torchvision`` | ``python`` | -+==========================+==========================+=================================+ -| ``main`` / ``nightly`` | ``main`` / ``nightly`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.12.0`` | ``0.13.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.11.0`` | ``0.12.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.2`` | ``0.11.3`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.1`` | ``0.11.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.0`` | ``0.11.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.1`` | ``0.10.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.0`` | ``0.10.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.2`` | ``0.9.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.1`` | ``0.9.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.0`` | ``0.9.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.1`` | ``0.8.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.1`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.0`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.6.0`` | ``0.7.0`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.1`` | ``0.6.1`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.0`` | ``0.6.0`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.4.0`` | ``0.5.0`` | ``==2.7``, ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.1`` | ``0.4.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.0`` | ``0.4.1`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.2.0`` | ``0.4.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.1.0`` | ``0.3.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``<=1.0.1`` | ``0.2.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | 
-+--------------------------+--------------------------+---------------------------------+ - -Anaconda: - -.. code:: bash - - conda install torchvision -c pytorch - -pip: - -.. code:: bash - - pip install torchvision - -From source: - -.. code:: bash - - python setup.py install - # or, for OSX - # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install - - -We don't officially support building from source using ``pip``, but *if* you do, -you'll need to use the ``--no-build-isolation`` flag. -In case building TorchVision from source fails, install the nightly version of PyTorch following -the linked guide on the `contributing page `_ and retry the install. - -By default, GPU support is built if CUDA is found and ``torch.cuda.is_available()`` is true. -It's possible to force building GPU support by setting ``FORCE_CUDA=1`` environment variable, -which is useful when building a docker image. - -Image Backend -============= -Torchvision currently supports the following image backends: - -* `Pillow`_ (default) - -* `Pillow-SIMD`_ - a **much faster** drop-in replacement for Pillow with SIMD. If installed will be used as the default. - -* `accimage`_ - if installed can be activated by calling :code:`torchvision.set_image_backend('accimage')` - -* `libpng`_ - can be installed via conda :code:`conda install libpng` or any of the package managers for debian-based and RHEL-based Linux distributions. - -* `libjpeg`_ - can be installed via conda :code:`conda install jpeg` or any of the package managers for debian-based and RHEL-based Linux distributions. `libjpeg-turbo`_ can be used as well. - -**Notes:** ``libpng`` and ``libjpeg`` must be available at compilation time in order to be available. Make sure that it is available on the standard library locations, -otherwise, add the include and library paths in the environment variables ``TORCHVISION_INCLUDE`` and ``TORCHVISION_LIBRARY``, respectively. - -.. _libpng : http://www.libpng.org/pub/png/libpng.html -.. _Pillow : https://python-pillow.org/ -.. _Pillow-SIMD : https://github.com/uploadcare/pillow-simd -.. _accimage: https://github.com/pytorch/accimage -.. _libjpeg: http://ijg.org/ -.. _libjpeg-turbo: https://libjpeg-turbo.org/ - -Video Backend -============= -Torchvision currently supports the following video backends: - -* `pyav`_ (default) - Pythonic binding for ffmpeg libraries. - -.. _pyav : https://github.com/PyAV-Org/PyAV - -* video_reader - This needs ffmpeg to be installed and torchvision to be built from source. There shouldn't be any conflicting version of ffmpeg installed. Currently, this is only supported on Linux. - -.. code:: bash - - conda install -c conda-forge ffmpeg - python setup.py install - - -Using the models on C++ -======================= -TorchVision provides an example project for how to use the models on C++ using JIT Script. - -Installation From source: - -.. code:: bash - - mkdir build - cd build - # Add -DWITH_CUDA=on support for the CUDA if needed - cmake .. - make - make install - -Once installed, the library can be accessed in cmake (after properly configuring ``CMAKE_PREFIX_PATH``) via the :code:`TorchVision::TorchVision` target: - -.. code:: rest - - find_package(TorchVision REQUIRED) - target_link_libraries(my-target PUBLIC TorchVision::TorchVision) - -The ``TorchVision`` package will also automatically look for the ``Torch`` package and add it as a dependency to ``my-target``, -so make sure that it is also available to cmake via the ``CMAKE_PREFIX_PATH``. 
- -For an example setup, take a look at ``examples/cpp/hello_world``. - -Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any Python -dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link to Python. This -can be done by passing ``-DUSE_PYTHON=on`` to CMake. - -TorchVision Operators ---------------------- -In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that you -:code:`#include ` in your project. - -Documentation -============= -You can find the API documentation on the pytorch website: https://pytorch.org/vision/stable/index.html - -Contributing -============ - -See the `CONTRIBUTING `_ file for how to help out. - -Disclaimer on Datasets -====================== - -This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license. - -If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community! - -Pre-trained Model License -========================= - -The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case. - -More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See `SWAG LICENSE `_ for additional details. diff --git a/android/README.md b/android/README.md new file mode 100644 index 00000000000..788c83f26de --- /dev/null +++ b/android/README.md @@ -0,0 +1,3 @@ +## Status + +The Android demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
diff --git a/android/build.gradle b/android/build.gradle index f28ba9112ff..f7995a07f5b 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -14,7 +14,7 @@ allprojects { androidSupportAppCompatV7Version = "28.0.0" fbjniJavaOnlyVersion = "0.0.3" - soLoaderNativeLoaderVersion = "0.10.4" + soLoaderNativeLoaderVersion = "0.10.5" pytorchAndroidVersion = "1.12" } diff --git a/android/gradle.properties b/android/gradle.properties index 1b6b275f63f..8204b73b051 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.14.0-SNAPSHOT +VERSION_NAME=0.15.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch SONATYPE_STAGING_PROFILE=orgpytorch diff --git a/android/ops/CMakeLists.txt b/android/ops/CMakeLists.txt index ad42adbfa71..fb8d4348e8e 100644 --- a/android/ops/CMakeLists.txt +++ b/android/ops/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) string(APPEND CMAKE_CXX_FLAGS " -DMOBILE") diff --git a/benchmarks/encoding_decoding.py b/benchmarks/encoding_decoding.py new file mode 100644 index 00000000000..0cafdb2d8a6 --- /dev/null +++ b/benchmarks/encoding_decoding.py @@ -0,0 +1,99 @@ +import os +import platform +import statistics + +import torch +import torch.utils.benchmark as benchmark +import torchvision + + +def print_machine_specs(): + print("Processor:", platform.processor()) + print("Platform:", platform.platform()) + print("Logical CPUs:", os.cpu_count()) + print(f"\nCUDA device: {torch.cuda.get_device_name()}") + print(f"Total Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB") + + +def get_data(): + transform = torchvision.transforms.Compose( + [ + torchvision.transforms.PILToTensor(), + ] + ) + path = os.path.join(os.getcwd(), "data") + testset = torchvision.datasets.Places365( + root="./data", download=not os.path.exists(path), transform=transform, split="val" + ) + testloader = torch.utils.data.DataLoader( + testset, batch_size=1000, shuffle=False, num_workers=1, collate_fn=lambda batch: [r[0] for r in batch] + ) + return next(iter(testloader)) + + +def run_encoding_benchmark(decoded_images): + results = [] + for device in ["cpu", "cuda"]: + decoded_images_device = [t.to(device=device) for t in decoded_images] + for size in [1, 100, 1000]: + for num_threads in [1, 12, 24]: + for stmt, strat in zip( + [ + "[torchvision.io.encode_jpeg(img) for img in decoded_images_device_trunc]", + "torchvision.io.encode_jpeg(decoded_images_device_trunc)", + ], + ["unfused", "fused"], + ): + decoded_images_device_trunc = decoded_images_device[:size] + t = benchmark.Timer( + stmt=stmt, + setup="import torchvision", + globals={"decoded_images_device_trunc": decoded_images_device_trunc}, + label="Image Encoding", + sub_label=f"{device.upper()} ({strat}): {stmt}", + description=f"{size} images", + num_threads=num_threads, + ) + results.append(t.blocked_autorange()) + compare = benchmark.Compare(results) + compare.print() + + +def run_decoding_benchmark(encoded_images): + results = [] + for device in ["cpu", "cuda"]: + for size in [1, 100, 1000]: + for num_threads in [1, 12, 24]: + for stmt, strat in zip( + [ + f"[torchvision.io.decode_jpeg(img, device='{device}') for img in encoded_images_trunc]", + f"torchvision.io.decode_jpeg(encoded_images_trunc, device='{device}')", + ], + ["unfused", "fused"], + ): + encoded_images_trunc = encoded_images[:size] + t = 
benchmark.Timer( + stmt=stmt, + setup="import torchvision", + globals={"encoded_images_trunc": encoded_images_trunc}, + label="Image Decoding", + sub_label=f"{device.upper()} ({strat}): {stmt}", + description=f"{size} images", + num_threads=num_threads, + ) + results.append(t.blocked_autorange()) + compare = benchmark.Compare(results) + compare.print() + + +if __name__ == "__main__": + print_machine_specs() + decoded_images = get_data() + mean_h, mean_w = statistics.mean(t.shape[-2] for t in decoded_images), statistics.mean( + t.shape[-1] for t in decoded_images + ) + print(f"\nMean image size: {int(mean_h)}x{int(mean_w)}") + run_encoding_benchmark(decoded_images) + encoded_images_cuda = torchvision.io.encode_jpeg([img.cuda() for img in decoded_images]) + encoded_images_cpu = [img.cpu() for img in encoded_images_cuda] + run_decoding_benchmark(encoded_images_cpu) diff --git a/cmake/TorchVisionConfig.cmake.in b/cmake/TorchVisionConfig.cmake.in index 9e92bc3b512..7f7e78817fa 100644 --- a/cmake/TorchVisionConfig.cmake.in +++ b/cmake/TorchVisionConfig.cmake.in @@ -46,13 +46,5 @@ if(@WITH_JPEG@) target_compile_definitions(${PN}::${PN} INTERFACE JPEG_FOUND) endif() -if (@USE_PYTHON@) - if(NOT TARGET Python3::Python) - find_package(Python3 COMPONENTS Development) - endif() - target_link_libraries(torch INTERFACE Python3::Python) - target_compile_definitions(${PN}::${PN} INTERFACE USE_PYTHON) -endif() - endif() endif() diff --git a/cmake/iOS.cmake b/cmake/iOS.cmake index d42ea4c9232..935c57f11b9 100644 --- a/cmake/iOS.cmake +++ b/cmake/iOS.cmake @@ -10,11 +10,11 @@ # SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. # # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# By default this location is automatically chosen based on the IOS_PLATFORM value above. # If set manually, it will override the default location and force the user of a particular Developer Platform # # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. +# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. # In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. 
# If set manually, this will force the use of a specific SDK version @@ -100,7 +100,7 @@ if(IOS_DEPLOYMENT_TARGET) set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -# Hidden visibilty is required for cxx on iOS +# Hidden visibility is required for cxx on iOS set(CMAKE_C_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS}") set(CMAKE_CXX_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -fvisibility-inlines-hidden") diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e..f462ff22303 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -33,6 +33,7 @@ clean: rm -rf $(SOURCEDIR)/auto_examples/ # sphinx-gallery rm -rf $(SOURCEDIR)/gen_modules/ # sphinx-gallery rm -rf $(SOURCEDIR)/generated/ # autosummary + rm -rf $(SOURCEDIR)/models/generated # autosummary .PHONY: help Makefile docset diff --git a/docs/requirements.txt b/docs/requirements.txt index 1ff0c828042..2a50d9b8f45 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,8 @@ matplotlib numpy sphinx-copybutton>=0.3.1 -sphinx-gallery>=0.9.0 +sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/docs/source/_static/css/custom_torchvision.css b/docs/source/_static/css/custom_torchvision.css index bdc4071c1aa..07346d7b03f 100644 --- a/docs/source/_static/css/custom_torchvision.css +++ b/docs/source/_static/css/custom_torchvision.css @@ -21,3 +21,15 @@ article.pytorch-article .reference.download.internal, article.pytorch-article .s .table-weights p { margin-bottom: 0.2rem !important; } + +/* Fix for Sphinx gallery 0.11 +See https://github.com/sphinx-gallery/sphinx-gallery/issues/990 +*/ +article.pytorch-article .sphx-glr-thumbnails .sphx-glr-thumbcontainer { + width: unset; + margin-right: 0; + margin-left: 0; +} +article.pytorch-article div.section div.wy-table-responsive tbody td { + width: 50%; +} diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5..8871f6debbb 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,11 +4,12 @@ class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + node = nodes.warning def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." - return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] + text = self.text.format(api_name=" ".join(self.content)) + return [self.node("", nodes.paragraph("", "", nodes.Text(text)))] def setup(app): diff --git a/docs/source/conf.py b/docs/source/conf.py index 231d3cad416..26771a7b711 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,6 +29,7 @@ import pytorch_sphinx_theme import torchvision import torchvision.models as M +from sphinx_gallery.sorting import ExplicitOrder from tabulate import tabulate sys.path.append(os.path.abspath(".")) @@ -55,11 +56,66 @@ "beta_status", ] +# We override sphinx-gallery's example header to prevent sphinx-gallery from +# creating a note at the top of the renderred notebook. +# https://github.com/sphinx-gallery/sphinx-gallery/blob/451ccba1007cc523f39cbcc960ebc21ca39f7b75/sphinx_gallery/gen_rst.py#L1267-L1271 +# This is because we also want to add a link to google Colab, so we write our own note in each example. +from sphinx_gallery import gen_rst + +gen_rst.EXAMPLE_HEADER = """ +.. DO NOT EDIT. +.. 
THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "{0}" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_{1}: + +""" + + +class CustomGalleryExampleSortKey: + # See https://sphinx-gallery.github.io/stable/configuration.html#sorting-gallery-examples + # and https://github.com/sphinx-gallery/sphinx-gallery/blob/master/sphinx_gallery/sorting.py + def __init__(self, src_dir): + self.src_dir = src_dir + + transforms_subsection_order = [ + "plot_transforms_getting_started.py", + "plot_transforms_illustrations.py", + "plot_transforms_e2e.py", + "plot_cutmix_mixup.py", + "plot_rotated_box_transforms.py", + "plot_custom_transforms.py", + "plot_tv_tensors.py", + "plot_custom_tv_tensors.py", + ] + + def __call__(self, filename): + if "gallery/transforms" in self.src_dir: + try: + return self.transforms_subsection_order.index(filename) + except ValueError as e: + raise ValueError( + "Looks like you added an example in gallery/transforms? " + "You need to specify its order in docs/source/conf.py. Look for CustomGalleryExampleSortKey." + ) from e + else: + # For other subsections we just sort alphabetically by filename + return filename + + sphinx_gallery_conf = { "examples_dirs": "../../gallery/", # path to your example scripts "gallery_dirs": "auto_examples", # path to where to save gallery generated output + "subsection_order": ExplicitOrder(["../../gallery/transforms", "../../gallery/others"]), "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), + "remove_config_comments": True, + "ignore_pattern": "helpers.py", + "within_subsection_order": CustomGalleryExampleSortKey, } napoleon_use_ivar = True @@ -88,17 +144,15 @@ # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -# -# The short X.Y version. -version = "main (" + torchvision.__version__ + " )" -# The full version, including alpha/beta/rc tags. -release = "main" -VERSION = os.environ.get("VERSION", None) -if VERSION: +# version: The short X.Y version. +# release: The full version, including alpha/beta/rc tags. +if os.environ.get("TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS", None): # Turn 1.11.0aHASH into 1.11 (major.minor only) - version = ".".join(version.split(".")[:2]) + version = release = ".".join(torchvision.__version__.split(".")[:2]) html_title = " ".join((project, version, "documentation")) - release = version +else: + version = f"main ({torchvision.__version__})" + release = "main" # The language for content autogenerated by Sphinx. Refer to documentation @@ -138,7 +192,7 @@ "logo_only": True, "pytorch_project": "docs", "navigation_with_keys": True, - "analytics_id": "UA-117752657-2", + "analytics_id": "GTM-T8XT4PS", } html_logo = "_static/img/pytorch-logo-dark.svg" @@ -318,7 +372,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): used within the autoclass directive. """ - if obj.__name__.endswith(("_Weights", "_QuantizedWeights")): + if getattr(obj, "__name__", "").endswith(("_Weights", "_QuantizedWeights")): if len(obj) == 0: lines[:] = ["There are no available pre-trained weights."] @@ -330,8 +384,8 @@ def inject_weight_metadata(app, what, name, obj, options, lines): f"``weights='DEFAULT'`` or ``weights='{str(list(obj)[0]).split('.')[1]}'``.", ] - if obj.__doc__ != "An enumeration.": - # We only show the custom enum doc if it was overriden. 
The default one from Python is "An enumeration" + if obj.__doc__ is not None and obj.__doc__ != "An enumeration.": + # We only show the custom enum doc if it was overridden. The default one from Python is "An enumeration" lines.append("") lines.append(obj.__doc__) @@ -362,6 +416,13 @@ def inject_weight_metadata(app, what, name, obj, options, lines): max_visible = 3 v_sample = ", ".join(v[:max_visible]) v = f"{v_sample}, ... ({len(v)-max_visible} omitted)" if len(v) > max_visible else v_sample + elif k == "_ops": + v = f"{v:.2f}" + k = "GIPS" if obj.__name__.endswith("_QuantizedWeights") else "GFLOPS" + elif k == "_file_size": + k = "File size" + v = f"{v:.1f} MB" + table.append((str(k), str(v))) table = tabulate(table, tablefmt="rst") lines += [".. rst-class:: table-weights"] # Custom CSS class, see custom_torchvision.css @@ -385,19 +446,27 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern if exclude_patterns is not None: weights = [w for w in weights if all(p not in str(w) for p in exclude_patterns)] + ops_name = "GIPS" if "QuantizedWeights" in weights_endswith else "GFLOPS" + metrics_keys, metrics_names = zip(*metrics) - column_names = ["Weight"] + list(metrics_names) + ["Params", "Recipe"] + column_names = ["Weight"] + list(metrics_names) + ["Params"] + [ops_name, "Recipe"] # Final column order column_names = [f"**{name}**" for name in column_names] # Add bold - content = [ - ( + content = [] + for w in weights: + row = [ f":class:`{w} <{type(w).__name__}>`", *(w.meta["_metrics"][dataset][metric] for metric in metrics_keys), f"{w.meta['num_params']/1e6:.1f}M", + f"{w.meta['_ops']:.2f}", f"`link <{w.meta['recipe']}>`__", - ) - for w in weights - ] + ] + + content.append(row) + + column_widths = ["110"] + ["18"] * len(metrics_names) + ["18"] * 2 + ["10"] + widths_table = " ".join(column_widths) + table = tabulate(content, headers=column_names, tablefmt="rst") generated_dir = Path("generated") @@ -405,7 +474,7 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern with open(generated_dir / f"{table_name}_table.rst", "w+") as table_file: table_file.write(".. rst-class:: table-weights\n") # Custom CSS class, see custom_torchvision.css table_file.write(".. table::\n") - table_file.write(f" :widths: 100 {'20 ' * len(metrics_names)} 20 10\n\n") + table_file.write(f" :widths: {widths_table} \n\n") table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 7641139daed..3caa7434e20 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -1,3 +1,5 @@ +.. _datasets: + Datasets ======== @@ -25,6 +27,15 @@ All the datasets have almost similar API. They all have two common arguments: ``transform`` and ``target_transform`` to transform the input and target respectively. You can also create your own datasets using the provided :ref:`base classes `. +.. warning:: + + When a dataset object is created with ``download=True``, the files are first + downloaded and extracted in the root directory. This download logic is not + multi-process safe, so it may lead to conflicts / race conditions if it is + run within a distributed setting. In distributed mode, we recommend creating + a dummy dataset object to trigger the download logic *before* setting up + distributed mode. 
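As a minimal illustration of the recommendation above (``CIFAR10`` is just an arbitrary example dataset here; any dataset with a ``download`` argument behaves the same way):

.. code:: python

    import torchvision

    # Trigger the download once, in the main process, *before* any distributed
    # setup (e.g. before calling torch.distributed.init_process_group or
    # spawning workers).
    torchvision.datasets.CIFAR10(root="data", download=True)

    # Later, inside each distributed process, the files are already on disk,
    # so the dataset can be constructed without downloading again.
    dataset = torchvision.datasets.CIFAR10(root="data", download=False)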
+ Image classification ~~~~~~~~~~~~~~~~~~~~ @@ -52,6 +63,7 @@ Image classification GTSRB INaturalist ImageNet + Imagenette KMNIST LFWPeople LSUN @@ -80,7 +92,6 @@ Image detection or segmentation CocoDetection CelebA Cityscapes - GTSRB Kitti OxfordIIITPet SBDataset @@ -111,11 +122,13 @@ Stereo Matching CarlaStereo Kitti2012Stereo Kitti2015Stereo + CREStereo FallingThingsStereo SceneFlowStereo SintelStereo InStereo2k ETH3DStereo + Middlebury2014Stereo Image pairs ~~~~~~~~~~~ @@ -145,9 +158,16 @@ Video classification HMDB51 Kinetics - Kinetics400 UCF101 +Video prediction +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + :template: class_dataset.rst + + MovingMNIST .. _base_classes_datasets: @@ -161,3 +181,12 @@ Base classes for custom datasets DatasetFolder ImageFolder VisionDataset + +Transforms v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + wrap_dataset_for_transforms_v2 diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047..dc5fdefaefb 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ architectures, and common image transformations for computer vision. :caption: Package Reference transforms + tv_tensors models datasets utils diff --git a/docs/source/io.rst b/docs/source/io.rst index 258a1ee16dc..478321a4e6d 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -1,88 +1,119 @@ -Reading/Writing images and videos -================================= +Decoding / Encoding images and videos +===================================== .. currentmodule:: torchvision.io -The :mod:`torchvision.io` package provides functions for performing IO -operations. They are currently specific to reading and writing video and -images. +The :mod:`torchvision.io` module provides utilities for decoding and encoding +images and videos. -Video ------ +Image Decoding +-------------- -.. autosummary:: - :toctree: generated/ - :template: function.rst +Torchvision currently supports decoding JPEG, PNG, WEBP, GIF, AVIF, and HEIC +images. JPEG decoding can also be done on CUDA GPUs. - read_video - read_video_timestamps - write_video +The main entry point is the :func:`~torchvision.io.decode_image` function, which +you can use as an alternative to ``PIL.Image.open()``. It will decode images +straight into image Tensors, thus saving you the conversion and allowing you to +run transforms/preproc natively on tensors. +.. code:: -Fine-grained video API ----------------------- + from torchvision.io import decode_image -In addition to the :mod:`read_video` function, we provide a high-performance -lower-level API for more fine-grained control compared to the :mod:`read_video` function. -It does all this whilst fully supporting torchscript. + img = decode_image("path_to_image", mode="RGB") + img.dtype # torch.uint8 -.. betastatus:: fine-grained video API + # Or + raw_encoded_bytes = ... # read encoded bytes from your file system + img = decode_image(raw_encoded_bytes, mode="RGB") -.. autosummary:: - :toctree: generated/ - :template: class.rst - - VideoReader +:func:`~torchvision.io.decode_image` will automatically detect the image format, +and call the corresponding decoder (except for HEIC and AVIF images, see details +in :func:`~torchvision.io.decode_avif` and :func:`~torchvision.io.decode_heic`). +You can also use the lower-level format-specific decoders which can be more +powerful, e.g. if you want to encode/decode JPEGs on CUDA. -Example of inspecting a video: +.. 
autosummary:: + :toctree: generated/ + :template: function.rst -.. code:: python + decode_image + decode_jpeg + decode_png + decode_webp + decode_avif + decode_heic + decode_gif - import torchvision - video_path = "path to a test video" - # Constructor allocates memory and a threaded decoder - # instance per video. At the moment it takes two arguments: - # path to the video file, and a wanted stream. - reader = torchvision.io.VideoReader(video_path, "video") +.. autosummary:: + :toctree: generated/ + :template: class.rst - # The information about the video can be retrieved using the - # `get_metadata()` method. It returns a dictionary for every stream, with - # duration and other relevant metadata (often frame rate) - reader_md = reader.get_metadata() + ImageReadMode - # metadata is structured as a dict of dicts with following structure - # {"stream_type": {"attribute": [attribute per stream]}} - # - # following would print out the list of frame rates for every present video stream - print(reader_md["video"]["fps"]) +Obsolete decoding function: - # we explicitly select the stream we would like to operate on. In - # the constructor we select a default video stream, but - # in practice, we can set whichever stream we would like - video.set_current_stream("video:0") +.. autosummary:: + :toctree: generated/ + :template: function.rst + read_image -Image ------ +Image Encoding +-------------- -.. autosummary:: - :toctree: generated/ - :template: class.rst +For encoding, JPEG (CPU and CUDA) and PNG are supported. - ImageReadMode .. autosummary:: :toctree: generated/ :template: function.rst - read_image - decode_image encode_jpeg - decode_jpeg write_jpeg encode_png - decode_png write_png + +IO operations +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + read_file write_file + +Video - DEPRECATED +------------------ + +.. warning:: + + DEPRECATED: All the video decoding and encoding capabilities of torchvision + are deprecated from version 0.22 and will be removed in version 0.24. We + recommend that you migrate to + `TorchCodec `__, where we'll + consolidate the future decoding/encoding capabilities of PyTorch. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + read_video + read_video_timestamps + write_video + + +**Fine-grained video API** + +In addition to the :mod:`read_video` function, we provide a high-performance +lower-level API for more fine-grained control compared to the :mod:`read_video` function. +It does all this whilst fully supporting torchscript. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + VideoReader diff --git a/docs/source/models.rst b/docs/source/models.rst index 57eda6d38a5..d0096aaf854 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -120,13 +120,12 @@ behavior, such as batch normalization. To switch between these modes, use # Set model to eval mode model.eval() -Model Registration Mechanism ---------------------------- - -.. betastatus:: registration mechanism +Listing and retrieving available models +--------------------------------------- -As of v0.14, TorchVision offers a new model registration mechanism which allows retreaving models -and weights by their names. Here are a few examples on how to use them: +As of v0.14, TorchVision offers a new mechanism which allows listing and +retrieving models and weights by their names. Here are a few examples on how to +use them: .. code:: python @@ -148,7 +147,7 @@ and weights by their names. 
Here are a few examples on how to use them: weights_enum2 = get_model_weights(torchvision.models.quantization.mobilenet_v3_large) assert weights_enum == weights_enum2 -Here are the available public methods of the model registration mechanism: +Here are the available public functions to retrieve models and their corresponding weights: .. currentmodule:: torchvision.models .. autosummary:: @@ -173,7 +172,11 @@ Most pre-trained models can be accessed directly via PyTorch Hub without having model = torch.hub.load("pytorch/vision", "resnet50", weights="IMAGENET1K_V2") # Option 2: passing weights param as enum - weights = torch.hub.load("pytorch/vision", "get_weight", weights="ResNet50_Weights.IMAGENET1K_V2") + weights = torch.hub.load( + "pytorch/vision", + "get_weight", + weights="ResNet50_Weights.IMAGENET1K_V2", + ) model = torch.hub.load("pytorch/vision", "resnet50", weights=weights) You can also retrieve all the available weights of a specific model via PyTorch Hub by doing: @@ -207,6 +210,7 @@ weights: models/efficientnetv2 models/googlenet models/inception + models/maxvit models/mnasnet models/mobilenetv2 models/mobilenetv3 @@ -226,10 +230,10 @@ Here is an example of how to use the pre-trained image classification models: .. code:: python - from torchvision.io import read_image + from torchvision.io import decode_image from torchvision.models import resnet50, ResNet50_Weights - img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") + img = decode_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") # Step 1: Initialize model with the best available weights weights = ResNet50_Weights.DEFAULT @@ -283,10 +287,10 @@ Here is an example of how to use the pre-trained quantized image classification .. code:: python - from torchvision.io import read_image + from torchvision.io import decode_image from torchvision.models.quantization import resnet50, ResNet50_QuantizedWeights - img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") + img = decode_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") # Step 1: Initialize model with the best available weights weights = ResNet50_QuantizedWeights.DEFAULT @@ -339,11 +343,11 @@ Here is an example of how to use the pre-trained semantic segmentation models: .. code:: python - from torchvision.io.image import read_image + from torchvision.io.image import decode_image from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights from torchvision.transforms.functional import to_pil_image - img = read_image("gallery/assets/dog1.jpg") + img = decode_image("gallery/assets/dog1.jpg") # Step 1: Initialize model with the best available weights weights = FCN_ResNet50_Weights.DEFAULT @@ -411,12 +415,12 @@ Here is an example of how to use the pre-trained object detection models: .. 
code:: python - from torchvision.io.image import read_image + from torchvision.io.image import decode_image from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights from torchvision.utils import draw_bounding_boxes from torchvision.transforms.functional import to_pil_image - img = read_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") + img = decode_image("test/assets/encode_jpeg/grace_hopper_517x606.jpg") # Step 1: Initialize model with the best available weights weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT @@ -517,6 +521,7 @@ pre-trained weights: models/video_mvit models/video_resnet models/video_s3d + models/video_swin_transformer | diff --git a/docs/source/models/alexnet.rst b/docs/source/models/alexnet.rst index 080c241983b..8e94b4eeed9 100644 --- a/docs/source/models/alexnet.rst +++ b/docs/source/models/alexnet.rst @@ -14,7 +14,7 @@ and is based on `One weird trick for parallelizing convolutional neural networks Model builders -------------- -The following model builders can be used to instanciate an AlexNet model, with or +The following model builders can be used to instantiate an AlexNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.alexnet.AlexNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnet.rst b/docs/source/models/efficientnet.rst index 4df547b3cbd..cbc9718959a 100644 --- a/docs/source/models/efficientnet.rst +++ b/docs/source/models/efficientnet.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNet model, with or +The following model builders can be used to instantiate an EfficientNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnetv2.rst b/docs/source/models/efficientnetv2.rst index 05c953b1327..3066c28ebd4 100644 --- a/docs/source/models/efficientnetv2.rst +++ b/docs/source/models/efficientnetv2.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNetV2 model, with or +The following model builders can be used to instantiate an EfficientNetV2 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/fcos.rst b/docs/source/models/fcos.rst index 1bcc4267678..085f26549b8 100644 --- a/docs/source/models/fcos.rst +++ b/docs/source/models/fcos.rst @@ -3,7 +3,7 @@ FCOS .. currentmodule:: torchvision.models.detection -The RetinaNet model is based on the `FCOS: Fully Convolutional One-Stage Object Detection +The FCOS model is based on the `FCOS: Fully Convolutional One-Stage Object Detection `__ paper. .. betastatus:: detection module @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a FCOS model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.fcos.FCOS`` base class. Please refer to the `source code `_ for more details about this class. 
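As a minimal sketch of the builder pattern described above, assuming the ``fcos_resnet50_fpn`` builder and its ``FCOS_ResNet50_FPN_Weights`` enum (neither is shown in this hunk):

.. code:: python

    from torchvision.models.detection import FCOS_ResNet50_FPN_Weights, fcos_resnet50_fpn

    # With pre-trained weights
    model = fcos_resnet50_fpn(weights=FCOS_ResNet50_FPN_Weights.DEFAULT)
    model.eval()

    # Without pre-trained weights (randomly initialized)
    model = fcos_resnet50_fpn(weights=None)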
diff --git a/docs/source/models/googlenet.rst b/docs/source/models/googlenet.rst index ed4f1345e23..91ea03ddf3d 100644 --- a/docs/source/models/googlenet.rst +++ b/docs/source/models/googlenet.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate a GoogLeNet model, with or +The following model builders can be used to instantiate a GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.googlenet.GoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/googlenet_quant.rst b/docs/source/models/googlenet_quant.rst index acb2737b52b..4358389b3e5 100644 --- a/docs/source/models/googlenet_quant.rst +++ b/docs/source/models/googlenet_quant.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized GoogLeNet +The following model builders can be used to instantiate a quantized GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.googlenet.QuantizableGoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/inception.rst b/docs/source/models/inception.rst index 72aa9724d41..e162eef5d30 100644 --- a/docs/source/models/inception.rst +++ b/docs/source/models/inception.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate an InceptionV3 model, with or +The following model builders can be used to instantiate an InceptionV3 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.inception.Inception3`` base class. Please refer to the `source code `_ for diff --git a/docs/source/models/inception_quant.rst b/docs/source/models/inception_quant.rst index 397fd10df3c..d26f1ab09da 100644 --- a/docs/source/models/inception_quant.rst +++ b/docs/source/models/inception_quant.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized Inception +The following model builders can be used to instantiate a quantized Inception model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.inception.QuantizableInception3`` base class. Please refer to the `source code diff --git a/docs/source/models/maxvit.rst b/docs/source/models/maxvit.rst new file mode 100644 index 00000000000..29aaaaab334 --- /dev/null +++ b/docs/source/models/maxvit.rst @@ -0,0 +1,23 @@ +MaxVit +=============== + +.. currentmodule:: torchvision.models + +The MaxVit transformer models are based on the `MaxViT: Multi-Axis Vision Transformer `__ +paper. + + +Model builders +-------------- + +The following model builders can be used to instantiate an MaxVit model with and without pre-trained weights. +All the model builders internally rely on the ``torchvision.models.maxvit.MaxVit`` +base class. Please refer to the `source code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + maxvit_t diff --git a/docs/source/models/mnasnet.rst b/docs/source/models/mnasnet.rst index e31b4aca1b6..fd9ea511585 100644 --- a/docs/source/models/mnasnet.rst +++ b/docs/source/models/mnasnet.rst @@ -11,7 +11,7 @@ Search for Mobile `__ paper. 
Model builders -------------- -The following model builders can be used to instanciate an MNASNet model. +The following model builders can be used to instantiate an MNASNet model. All the model builders internally rely on the ``torchvision.models.mnasnet.MNASNet`` base class. Please refer to the `source code diff --git a/docs/source/models/retinanet.rst b/docs/source/models/retinanet.rst index 8613ae9aaab..910692ef3a5 100644 --- a/docs/source/models/retinanet.rst +++ b/docs/source/models/retinanet.rst @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a RetinaNet model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.retinanet.RetinaNet`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/ssd.rst b/docs/source/models/ssd.rst index 7d73b234a28..68b0bb224df 100644 --- a/docs/source/models/ssd.rst +++ b/docs/source/models/ssd.rst @@ -12,7 +12,7 @@ The SSD model is based on the `SSD: Single Shot MultiBox Detector Model builders -------------- -The following model builders can be used to instanciate a SSD model, with or +The following model builders can be used to instantiate a SSD model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.SSD`` base class. Please refer to the `source code diff --git a/docs/source/models/ssdlite.rst b/docs/source/models/ssdlite.rst index bac1575c966..7701d1c9f9f 100644 --- a/docs/source/models/ssdlite.rst +++ b/docs/source/models/ssdlite.rst @@ -6,7 +6,7 @@ SSDlite The SSDLite model is based on the `SSD: Single Shot MultiBox Detector `__, `Searching for MobileNetV3 `__ and `MobileNetV2: Inverted Residuals and Linear -Bottlenecks __` papers. +Bottlenecks `__ papers. .. betastatus:: detection module @@ -17,7 +17,7 @@ The following model builders can be used to instantiate a SSD Lite model, with o without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.ssd.SSD`` base class. Please refer to the `source code -`_ for +`_ for more details about this class. .. autosummary:: diff --git a/docs/source/models/swin_transformer.rst b/docs/source/models/swin_transformer.rst index 35b52987954..b302f5bd79d 100644 --- a/docs/source/models/swin_transformer.rst +++ b/docs/source/models/swin_transformer.rst @@ -15,7 +15,7 @@ Model builders -------------- The following model builders can be used to instantiate an SwinTransformer model (original and V2) with and without pre-trained weights. -All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` +All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/vgg.rst b/docs/source/models/vgg.rst index a9fa9aabfb1..77b5686927c 100644 --- a/docs/source/models/vgg.rst +++ b/docs/source/models/vgg.rst @@ -11,7 +11,7 @@ Model builders -------------- The following model builders can be used to instantiate a VGG model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.vgg.VGG`` base class. 
Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/video_swin_transformer.rst b/docs/source/models/video_swin_transformer.rst new file mode 100644 index 00000000000..e31e69759b4 --- /dev/null +++ b/docs/source/models/video_swin_transformer.rst @@ -0,0 +1,27 @@ +Video SwinTransformer +===================== + +.. currentmodule:: torchvision.models.video + +The Video SwinTransformer model is based on the `Video Swin Transformer `__ paper. + +.. betastatus:: video module + + +Model builders +-------------- + +The following model builders can be used to instantiate a Video SwinTransformer model, with or +without pre-trained weights. All the model builders internally rely on the +``torchvision.models.video.swin_transformer.SwinTransformer3d`` base class. Please refer to the `source +code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + swin3d_t + swin3d_s + swin3d_b diff --git a/docs/source/training_references.rst b/docs/source/training_references.rst index fc22ac5eba6..4b5e43109c2 100644 --- a/docs/source/training_references.rst +++ b/docs/source/training_references.rst @@ -19,9 +19,9 @@ guarantees. In general, these scripts rely on the latest (not yet released) pytorch version or the latest torchvision version. This means that to use them, **you might need -to install the latest pytorch and torchvision versions**, with e.g.:: +to install the latest pytorch and torchvision versions** following the `official +instructions `_. - conda install pytorch torchvision -c pytorch-nightly If you need to rely on an older stable version of pytorch or torchvision, e.g. torchvision 0.10, then it's safer to use the scripts from that corresponding diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 5909b68966b..44b4cc3aaa5 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -1,127 +1,572 @@ .. _transforms: -Transforming and augmenting images -================================== +Transforming images, videos, boxes and more +=========================================== .. currentmodule:: torchvision.transforms -Transforms are common image transformations available in the -``torchvision.transforms`` module. They can be chained together using -:class:`Compose`. -Most transform classes have a function equivalent: :ref:`functional -transforms ` give fine-grained control over the -transformations. -This is useful if you have to build a more complex transformation pipeline -(e.g. in the case of segmentation tasks). - -Most transformations accept both `PIL `_ -images and tensor images, although some transformations are :ref:`PIL-only -` and some are :ref:`tensor-only -`. The :ref:`conversion_transforms` may be used to -convert to and from PIL images. - -The transformations that accept tensor images also accept batches of tensor -images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a -number of channels, ``H`` and ``W`` are image height and width. A batch of -Tensor Images is a tensor of ``(B, C, H, W)`` shape, where ``B`` is a number -of images in the batch. +Torchvision supports common computer vision transformations in the +``torchvision.transforms.v2`` module. Transforms can be used to transform and +augment data, for both training and inference. 
The following objects are +supported: + +- Images as pure tensors, :class:`~torchvision.tv_tensors.Image` or PIL image +- Videos as :class:`~torchvision.tv_tensors.Video` +- Axis-aligned and rotated bounding boxes as :class:`~torchvision.tv_tensors.BoundingBoxes` +- Segmentation and detection masks as :class:`~torchvision.tv_tensors.Mask` +- KeyPoints as :class:`~torchvision.tv_tensors.KeyPoints`. + +.. code:: python + + # Image Classification + import torch + from torchvision.transforms import v2 + + H, W = 32, 32 + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + + transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + img = transforms(img) + +.. code:: python + + # Detection (re-using imports and transforms from above) + from torchvision import tv_tensors + + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + boxes = torch.randint(0, H // 2, size=(3, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + # The same transforms can be used! + img, boxes = transforms(img, boxes) + # And you can pass arbitrary input structures + output_dict = transforms({"image": img, "boxes": boxes}) + +Transforms are typically passed as the ``transform`` or ``transforms`` argument +to the :ref:`Datasets `. + +Start here +---------- + +Whether you're new to Torchvision transforms, or you're already experienced with +them, we encourage you to start with +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` in +order to learn more about what can be done with the new v2 transforms. + +Then, browse the sections below on this page for general information and +performance tips. The available transforms and functionals are listed in the +:ref:`API reference `. + +More information and tutorials can also be found in our :ref:`example gallery +`, e.g. :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py` +or :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py`. + +.. _conventions: + +Supported input types and conventions +------------------------------------- + +Most transformations accept both `PIL `_ images +and tensor inputs. Both CPU and CUDA tensors are supported. +The result of both backends (PIL or Tensors) should be very +close. In general, we recommend relying on the tensor backend :ref:`for +performance `. The :ref:`conversion transforms +` may be used to convert to and from PIL images, or for +converting dtypes and ranges. + +Tensor images are expected to be of shape ``(C, H, W)``, where ``C`` is the +number of channels, and ``H`` and ``W`` refer to height and width. Most +transforms support batched tensor input. A batch of Tensor images is a tensor of +shape ``(N, C, H, W)``, where ``N`` is the number of images in the batch. The +:ref:`v2 ` transforms generally accept an arbitrary number of leading +dimensions ``(..., C, H, W)`` and can handle batched images or batched videos. + +.. _range_and_dtype: + +Dtype and expected value range +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The expected range of the values of a tensor image is implicitly defined by the tensor dtype. Tensor images with a float dtype are expected to have -values in ``[0, 1)``. 
Tensor images with an integer dtype are expected to +values in ``[0, 1]``. Tensor images with an integer dtype are expected to have values in ``[0, MAX_DTYPE]`` where ``MAX_DTYPE`` is the largest value -that can be represented in that dtype. +that can be represented in that dtype. Typically, images of dtype +``torch.uint8`` are expected to have values in ``[0, 255]``. -Randomized transformations will apply the same transformation to all the -images of a given batch, but they will produce different transformations -across calls. For reproducible transformations across calls, you may use -:ref:`functional transforms `. +Use :class:`~torchvision.transforms.v2.ToDtype` to convert both the dtype and +range of the inputs. -The following examples illustrate the use of the available transforms: +.. _v1_or_v2: - * :ref:`sphx_glr_auto_examples_plot_transforms.py` +V1 or V2? Which one should I use? +--------------------------------- - .. figure:: ../source/auto_examples/images/sphx_glr_plot_transforms_001.png - :align: center - :scale: 65% +**TL;DR** We recommend using the ``torchvision.transforms.v2`` transforms +instead of those in ``torchvision.transforms``. They're faster and they can do +more things. Just change the import and you should be good to go. Moving +forward, new features and improvements will only be considered for the v2 +transforms. + +In Torchvision 0.15 (March 2023), we released a new set of transforms available +in the ``torchvision.transforms.v2`` namespace. These transforms have a lot of +advantages compared to the v1 ones (in ``torchvision.transforms``): + +- They can transform images **and also** bounding boxes, masks, videos and + keypoints. This provides support for tasks beyond image classification: + detection, segmentation, video classification, pose estimation, etc. See + :ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` + and :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`. +- They support more transforms like :class:`~torchvision.transforms.v2.CutMix` + and :class:`~torchvision.transforms.v2.MixUp`. See + :ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py`. +- They're :ref:`faster `. +- They support arbitrary input structures (dicts, lists, tuples, etc.). +- Future improvements and features will be added to the v2 transforms only. + +These transforms are **fully backward compatible** with the v1 ones, so if +you're already using transforms from ``torchvision.transforms``, all you need to +do is update the import to ``torchvision.transforms.v2``. In terms of +output, there might be negligible differences due to implementation differences. + +.. _transforms_perf: + +Performance considerations +-------------------------- - * :ref:`sphx_glr_auto_examples_plot_scripted_tensor_transforms.py` +We recommend the following guidelines to get the best performance out of the +transforms: - .. figure:: ../source/auto_examples/images/sphx_glr_plot_scripted_tensor_transforms_001.png - :align: center - :scale: 30% +- Rely on the v2 transforms from ``torchvision.transforms.v2`` +- Use tensors instead of PIL images +- Use ``torch.uint8`` dtype, especially for resizing +- Resize with bilinear or bicubic mode -.. warning:: +This is what a typical transform pipeline could look like: - Since v0.8.0 all random transformations are using torch default random generator to sample random parameters. - It is a backward compatibility breaking change and user should set the random state as following: +.. 
code:: python + + from torchvision.transforms import v2 + transforms = v2.Compose([ + v2.ToImage(), # Convert to tensor, only needed if you had a PIL image + v2.ToDtype(torch.uint8, scale=True), # optional, most inputs are already uint8 at this point + # ... + v2.RandomResizedCrop(size=(224, 224), antialias=True), # Or Resize(antialias=True) + # ... + v2.ToDtype(torch.float32, scale=True), # Normalize expects float input + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + +The above should give you the best performance in a typical training environment +that relies on the :class:`torch.utils.data.DataLoader` with ``num_workers > +0``. + +Transforms tend to be sensitive to the input strides / memory format. Some +transforms will be faster with channels-first images while others prefer +channels-last. Like ``torch`` operators, most transforms will preserve the +memory format of the input, but this may not always be respected due to +implementation details. You may want to experiment a bit if you're chasing the +very best performance. Using :func:`torch.compile` on individual transforms may +also help factor out the memory format variable (e.g. on +:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about +**memory format**, not :ref:`tensor shape `. + +Note that resize transforms like :class:`~torchvision.transforms.v2.Resize` +and :class:`~torchvision.transforms.v2.RandomResizedCrop` typically prefer +channels-last input and tend **not** to benefit from :func:`torch.compile` at +this time. - .. code:: python +.. _functional_transforms: - # Previous versions - # import random - # random.seed(12) +Transform classes, functionals, and kernels +------------------------------------------- - # Now - import torch - torch.manual_seed(17) +Transforms are available as classes like +:class:`~torchvision.transforms.v2.Resize`, but also as functionals like +:func:`~torchvision.transforms.v2.functional.resize` in the +``torchvision.transforms.v2.functional`` namespace. +This is very much like the :mod:`torch.nn` package which defines both classes +and functional equivalents in :mod:`torch.nn.functional`. - Please, keep in mind that the same seed for torch random generator and Python random generator will not - produce the same results. +The functionals support PIL images, pure tensors, or :ref:`TVTensors +`, e.g. both ``resize(image_tensor)`` and ``resize(boxes)`` are +valid. +.. note:: -Scriptable transforms ---------------------- + Random transforms like :class:`~torchvision.transforms.v2.RandomCrop` will + randomly sample some parameter each time they're called. Their functional + counterpart (:func:`~torchvision.transforms.v2.functional.crop`) does not do + any kind of random sampling and thus has a slightly different + parametrization. The ``get_params()`` class method of the transforms class + can be used to perform parameter sampling when using the functional APIs. -In order to script the transformations, please use ``torch.nn.Sequential`` instead of :class:`Compose`. + +The ``torchvision.transforms.v2.functional`` namespace also contains what we +call the "kernels". These are the low-level functions that implement the +core functionalities for specific types, e.g. ``resize_bounding_boxes`` or +``resized_crop_mask``. They are public, although not documented. Check the +`code +`_ +to see which ones are available (note that those starting with a leading +underscore are **not** public!). 
Kernels are only really useful if you want +:ref:`torchscript support ` for types like bounding +boxes or masks. + +.. _transforms_torchscript: + +Torchscript support +------------------- + +Most transform classes and functionals support torchscript. For composing +transforms, use :class:`torch.nn.Sequential` instead of +:class:`~torchvision.transforms.v2.Compose`: .. code:: python transforms = torch.nn.Sequential( - transforms.CenterCrop(10), - transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + CenterCrop(10), + Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ) scripted_transforms = torch.jit.script(transforms) -Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor`` and does not require -`lambda` functions or ``PIL.Image``. +.. warning:: -For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``. + v2 transforms support torchscript, but if you call ``torch.jit.script()`` on + a v2 **class** transform, you'll actually end up with its (scripted) v1 + equivalent. This may lead to slightly different results between the + scripted and eager executions due to implementation differences between v1 + and v2. + If you really need torchscript support for the v2 transforms, we recommend + scripting the **functionals** from the + ``torchvision.transforms.v2.functional`` namespace to avoid surprises. -Compositions of transforms --------------------------- + +Also note that the functionals only support torchscript for pure tensors, which +are always treated as images. If you need torchscript support for other types +like bounding boxes or masks, you can rely on the :ref:`low-level kernels +`. + +For any custom transformations to be used with ``torch.jit.script``, they should +be derived from ``torch.nn.Module``. + +See also: :ref:`sphx_glr_auto_examples_others_plot_scripted_tensor_transforms.py`. + +.. _v2_api_ref: + +V2 API reference - Recommended +------------------------------ + +Geometry +^^^^^^^^ + +Resizing +"""""""" .. autosummary:: :toctree: generated/ :template: class.rst - Compose + v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize +Functionals -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.resize + +Cropping +"""""""" .. autosummary:: :toctree: generated/ :template: class.rst + v2.RandomCrop + v2.RandomResizedCrop + v2.RandomIoUCrop + v2.CenterCrop + v2.FiveCrop + v2.TenCrop + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.crop + v2.functional.resized_crop + v2.functional.ten_crop + v2.functional.center_crop + v2.functional.five_crop + +Others +"""""" + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.RandomHorizontalFlip + v2.RandomVerticalFlip + v2.Pad + v2.RandomZoomOut + v2.RandomRotation + v2.RandomAffine + v2.RandomPerspective + v2.ElasticTransform + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.horizontal_flip + v2.functional.vertical_flip + v2.functional.pad + v2.functional.rotate + v2.functional.affine + v2.functional.perspective + v2.functional.elastic + +Color +^^^^^ + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ColorJitter + v2.RandomChannelPermutation + v2.RandomPhotometricDistort + v2.Grayscale + v2.RGB + v2.RandomGrayscale + v2.GaussianBlur + v2.GaussianNoise + v2.RandomInvert + v2.RandomPosterize + v2.RandomSolarize + v2.RandomAdjustSharpness + v2.RandomAutocontrast + v2.RandomEqualize + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.permute_channels + v2.functional.rgb_to_grayscale + v2.functional.grayscale_to_rgb + v2.functional.to_grayscale + v2.functional.gaussian_blur + v2.functional.gaussian_noise + v2.functional.invert + v2.functional.posterize + v2.functional.solarize + v2.functional.adjust_sharpness + v2.functional.autocontrast + v2.functional.adjust_contrast + v2.functional.equalize + v2.functional.adjust_brightness + v2.functional.adjust_saturation + v2.functional.adjust_hue + v2.functional.adjust_gamma + + +Composition +^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.Compose + v2.RandomApply + v2.RandomChoice + v2.RandomOrder + +Miscellaneous +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.LinearTransformation + v2.Normalize + v2.RandomErasing + v2.Lambda + v2.SanitizeBoundingBoxes + v2.ClampBoundingBoxes + v2.ClampKeyPoints + v2.UniformTemporalSubsample + v2.JPEG + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.normalize + v2.functional.erase + v2.functional.sanitize_bounding_boxes + v2.functional.clamp_bounding_boxes + v2.functional.clamp_keypoints + v2.functional.uniform_temporal_subsample + v2.functional.jpeg + +.. _conversion_transforms: + +Conversion +^^^^^^^^^^ + +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToImage + v2.ToPureTensor + v2.PILToTensor + v2.ToPILImage + v2.ToDtype + v2.ConvertBoundingBoxFormat + +functionals + +.. autosummary:: + :toctree: generated/ + :template: functional.rst + + v2.functional.to_image + v2.functional.pil_to_tensor + v2.functional.to_pil_image + v2.functional.to_dtype + v2.functional.convert_bounding_box_format + + +Deprecated + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToTensor + v2.functional.to_tensor + v2.ConvertImageDtype + v2.functional.convert_image_dtype + +Auto-Augmentation +^^^^^^^^^^^^^^^^^ + +`AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. +Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that +ImageNet policies provide significant improvements when applied to other datasets. +In TorchVision we implemented 3 policies learned on the following datasets: ImageNet, CIFAR10 and SVHN. +The new transform can be used standalone or mixed-and-matched with existing transforms: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.AutoAugment + v2.RandAugment + v2.TrivialAugmentWide + v2.AugMix + + +CutMix - MixUp +^^^^^^^^^^^^^^ + +CutMix and MixUp are special transforms that +are meant to be used on batches rather than on individual images, because they +are combining pairs of images together. 
These can be used after the dataloader +(once the samples are batched), or part of a collation function. See +:ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py` for detailed usage examples. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.CutMix + v2.MixUp + +Developer tools +^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.Transform + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.register_kernel + v2.query_size + v2.query_chw + v2.get_bounding_boxes + + +V1 API Reference +---------------- + +Geometry +^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Resize + RandomCrop + RandomResizedCrop CenterCrop - ColorJitter FiveCrop - Grayscale + TenCrop Pad + RandomRotation RandomAffine - RandomApply - RandomCrop - RandomGrayscale - RandomHorizontalFlip RandomPerspective - RandomResizedCrop - RandomRotation + ElasticTransform + RandomHorizontalFlip RandomVerticalFlip - Resize - TenCrop + + +Color +^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ColorJitter + Grayscale + RandomGrayscale GaussianBlur RandomInvert RandomPosterize @@ -130,23 +575,20 @@ Transforms on PIL Image and torch.\*Tensor RandomAutocontrast RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :template: class.rst + Compose + RandomApply RandomChoice RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ @@ -155,12 +597,16 @@ Transforms on torch.\*Tensor only LinearTransformation Normalize RandomErasing - ConvertImageDtype + Lambda -.. _conversion_transforms: +Conversion +^^^^^^^^^^ -Conversion Transforms ---------------------- +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. .. autosummary:: :toctree: generated/ @@ -169,20 +615,10 @@ Conversion Transforms ToPILImage ToTensor PILToTensor + ConvertImageDtype - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +^^^^^^^^^^^^^^^^^ `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that @@ -200,57 +636,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran TrivialAugmentWide AugMix -.. _functional_transforms: + Functional Transforms ---------------------- +^^^^^^^^^^^^^^^^^^^^^ .. currentmodule:: torchvision.transforms.functional -Functional transforms give you fine-grained control of the transformation pipeline. -As opposed to the transformations above, functional transforms don't contain a random number -generator for their parameters. -That means you have to specify/generate all parameters, but the functional transform will give you -reproducible results across calls. 
- -Example: -you can apply a functional transform with the same parameters to multiple images like this: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - def my_segmentation_transforms(image, segmentation): - if random.random() > 0.5: - angle = random.randint(-30, 30) - image = TF.rotate(image, angle) - segmentation = TF.rotate(segmentation, angle) - # more transforms ... - return image, segmentation - - -Example: -you can use a functional transform to build transform classes with custom behavior: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - class MyRotationTransform: - """Rotate by one of the given angles.""" - - def __init__(self, angles): - self.angles = angles - - def __call__(self, x): - angle = random.choice(self.angles) - return TF.rotate(x, angle) - - rotation_transform = MyRotationTransform(angles=[-30, -15, 0, 15, 30]) - - .. autosummary:: :toctree: generated/ :template: function.rst diff --git a/docs/source/tv_tensors.rst b/docs/source/tv_tensors.rst new file mode 100644 index 00000000000..d292012fdf8 --- /dev/null +++ b/docs/source/tv_tensors.rst @@ -0,0 +1,30 @@ +.. _tv_tensors: + +TVTensors +========== + +.. currentmodule:: torchvision.tv_tensors + +TVTensors are :class:`torch.Tensor` subclasses which the v2 :ref:`transforms +` use under the hood to dispatch their inputs to the appropriate +lower-level kernels. Most users do not need to manipulate TVTensors directly. + +Refer to +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` for +an introduction to TVTensors, or +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py` for more advanced +info. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + KeyPoints + BoundingBoxFormat + BoundingBoxes + Mask + TVTensor + set_return_type + wrap diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 276f730c294..cda04de900a 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -4,7 +4,7 @@ Utils ===== The ``torchvision.utils`` module contains various utilities, mostly :ref:`for -vizualization `. +visualization `. .. currentmodule:: torchvision.utils diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt new file mode 100644 index 00000000000..a1329b0c968 --- /dev/null +++ b/examples/cpp/CMakeLists.txt @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.10) +project(run_model) + +option(USE_TORCHVISION "Whether to link to torchvision" OFF) + +find_package(Torch REQUIRED) +if(USE_TORCHVISION) + find_package(TorchVision REQUIRED) +endif() + +add_executable(run_model run_model.cpp) + +target_link_libraries(run_model "${TORCH_LIBRARIES}") +if(USE_TORCHVISION) + target_link_libraries(run_model TorchVision::TorchVision) +endif() + +set_property(TARGET run_model PROPERTY CXX_STANDARD 17) diff --git a/examples/cpp/README.md b/examples/cpp/README.md new file mode 100644 index 00000000000..b2a9174c8ba --- /dev/null +++ b/examples/cpp/README.md @@ -0,0 +1,101 @@ +Using torchvision models in C++ +=============================== + +This is a minimal example of getting TorchVision models to work in C++ with +Torchscript. The model is first scripted in Python and exported to a file, and +then loaded in C++. For a similar tutorial, see [this +tutorial](https://pytorch.org/tutorials/advanced/cpp_export.html). + +In order to successfully compile this example, make sure you have ``LibTorch`` +installed. 
You can either: + +- Install PyTorch normally +- Or download the LibTorch C++ distribution. + +In both cases, refer [here](https://pytorch.org/get-started/locally/) for the +corresponding install or download instructions. + +Some torchvision models only depend on PyTorch operators, and can be used in C++ +without depending on the torchvision lib. Other models rely on torchvision's C++ +operators like NMS, RoiAlign (typically the detection models) and those need to +be linked against the torchvision lib. + +We'll first see the simpler case of running a model without the torchvision lib +dependency. + +Running a model that doesn't need torchvision lib +------------------------------------------------- + +Create a ``build`` directory inside the current one. + +```bash +mkdir build +cd build +``` + +Then run `python ../trace_model.py` which should create a `resnet18.pt` file in +the build directory. This is the scripted model that will be used in the C++ +code. + +We can now start building with CMake. We have to tell CMake where it can find +the necessary PyTorch resources. If you installed PyTorch normally, you can do: + +```bash +TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))") +Torch_DIR="${TORCH_PATH}/share/cmake/Torch" # there should be .cmake files in there + +cmake .. -DTorch_DIR=$Torch_DIR +``` + +If instead you downloaded LibTorch somewhere, you can do: + +```bash +cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch +``` + +Then `cmake --build .` and you should now be able to run + +```bash +./run_model resnet18.pt +``` + +If you try to run a model that depends on the torchvision lib, like +`./run_model fasterrcnn_resnet50_fpn.pt`, you should get a runtime error. This is +because the executable wasn't linked against the torchvision lib. + + +Running a model that needs torchvision lib +------------------------------------------ + +First, we need to build the torchvision lib. To build the torchvision lib go to +the root of the torchvision project and run: + +```bash +mkdir build +cd build +cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch # or -DTorch_DIR= if you installed PyTorch normally, see above +cmake --build . +cmake --install . +``` + +You may want to pass `-DCMAKE_INSTALL_PREFIX=/path/to/libtorchvision` for +cmake to copy/install the files to a specific location (e.g. `$CONDA_PREFIX`). + +**DISCLAIMER**: the `libtorchvision` library includes the torchvision +custom ops as well as most of the C++ torchvision APIs. Those APIs do not come +with any backward-compatibility guarantees and may change from one version to +the next. Only the Python APIs are stable and with backward-compatibility +guarantees. So, if you need stability within a C++ environment, your best bet is +to export the Python APIs via torchscript. + +Now that libtorchvision is built and installed, we can tell our project to use +and link to it via the `-DUSE_TORCHVISION` flag. We also need to tell CMake +where to find it, just like we did with LibTorch, e.g.: + +```bash +cmake .. -DTorch_DIR=$Torch_DIR -DTorchVision_DIR=path/to/libtorchvision -DUSE_TORCHVISION=ON +cmake --build . +``` + +Now the `run_model` executable should be able to run the +`fasterrcnn_resnet50_fpn.pt` file. 
diff --git a/examples/cpp/hello_world/CMakeLists.txt b/examples/cpp/hello_world/CMakeLists.txt deleted file mode 100644 index 3ca59e4c199..00000000000 --- a/examples/cpp/hello_world/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -cmake_minimum_required(VERSION 3.10) -project(hello-world) - -# The first thing do is to tell cmake to find the TorchVision library. -# The package pulls in all the necessary torch libraries, -# so there is no need to also add `find_package(Torch)` here. -find_package(TorchVision REQUIRED) - -# This due to LibTorch's version is the one included in the Python -# package that links to Python. -find_package(Python3 COMPONENTS Development) - -add_executable(hello-world main.cpp) - -# We now need to link the TorchVision library to our executable. -# We can do that by using the TorchVision::TorchVision target, -# which also adds all the necessary torch dependencies. -target_compile_features(hello-world PUBLIC cxx_range_for) -target_link_libraries(hello-world TorchVision::TorchVision) -set_property(TARGET hello-world PROPERTY CXX_STANDARD 14) diff --git a/examples/cpp/hello_world/README.rst b/examples/cpp/hello_world/README.rst deleted file mode 100644 index 81161112034..00000000000 --- a/examples/cpp/hello_world/README.rst +++ /dev/null @@ -1,19 +0,0 @@ -Hello World! -============ - -This is a minimal example of getting TorchVision to work in C++ with CMake. - - -In order to successfully compile this example, make sure you have both ``LibTorch`` and -``TorchVision`` installed. -Once both dependencies are sorted, we can start the CMake fun: - -1) Create a ``build`` directory inside the current one. -2) from within the ``build`` directory, run the following commands: - - | ``cmake -DCMAKE_PREFIX_PATH=";" ..`` - | where ```` and ```` are the paths to the libtorch and torchvision installations. - - ``cmake --build .`` - -| That's it! -| You should now have a ``hello-world`` executable in your ``build`` folder. - Running it will output a (fairly long) tensor of random values to your terminal. diff --git a/examples/cpp/hello_world/main.cpp b/examples/cpp/hello_world/main.cpp deleted file mode 100644 index bcbe68dd07d..00000000000 --- a/examples/cpp/hello_world/main.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include -#include -#include - -int main() { - torch::DeviceType device_type; - device_type = torch::kCPU; - - torch::jit::script::Module model; - try { - std::cout << "Loading model\n"; - // Deserialize the ScriptModule from a file using torch::jit::load(). - model = torch::jit::load("resnet18.pt"); - std::cout << "Model loaded\n"; - } catch (const torch::Error& e) { - std::cout << "error loading the model\n"; - return -1; - } catch (const std::exception& e) { - std::cout << "Other error: " << e.what() << "\n"; - return -1; - } - - // TorchScript models require a List[IValue] as input - std::vector inputs; - - // Create a random input tensor and run it through the model. 
- inputs.push_back(torch::rand({1, 3, 10, 10})); - auto out = model.forward(inputs); - std::cout << out << "\n"; - - if (torch::cuda::is_available()) { - // Move model and inputs to GPU - model.to(torch::kCUDA); - - // Add GPU inputs - inputs.clear(); - torch::TensorOptions options = torch::TensorOptions{torch::kCUDA}; - inputs.push_back(torch::rand({1, 3, 10, 10}, options)); - - auto gpu_out = model.forward(inputs); - std::cout << gpu_out << "\n"; - } -} diff --git a/examples/cpp/hello_world/trace_model.py b/examples/cpp/hello_world/trace_model.py deleted file mode 100644 index 41bbaf8b6dd..00000000000 --- a/examples/cpp/hello_world/trace_model.py +++ /dev/null @@ -1,13 +0,0 @@ -import os.path as osp - -import torch -import torchvision - -HERE = osp.dirname(osp.abspath(__file__)) -ASSETS = osp.dirname(osp.dirname(HERE)) - -model = torchvision.models.resnet18() -model.eval() - -traced_model = torch.jit.script(model) -traced_model.save("resnet18.pt") diff --git a/examples/cpp/run_model.cpp b/examples/cpp/run_model.cpp new file mode 100644 index 00000000000..11faead5dac --- /dev/null +++ b/examples/cpp/run_model.cpp @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#endif // _WIN32 + +int main(int argc, const char* argv[]) { + if (argc != 2) { + std::cout << "Usage: run_model \n"; + return -1; + } + torch::DeviceType device_type; + device_type = torch::kCPU; + + torch::jit::script::Module model; + try { + std::cout << "Loading model\n"; + // Deserialize the ScriptModule from a file using torch::jit::load(). + model = torch::jit::load(argv[1]); + std::cout << "Model loaded\n"; + } catch (const torch::Error&) { + std::cout << "error loading the model.\n"; + return -1; + } catch (const std::exception& e) { + std::cout << "Other error: " << e.what() << "\n"; + return -1; + } + + // TorchScript models require a List[IValue] as input + std::vector inputs; + + if (std::strstr(argv[1], "fasterrcnn") != NULL) { + // Faster RCNN accepts a List[Tensor] as main input + std::vector images; + images.push_back(torch::rand({3, 256, 275})); + images.push_back(torch::rand({3, 256, 275})); + inputs.push_back(images); + } else { + inputs.push_back(torch::rand({1, 3, 10, 10})); + } + auto out = model.forward(inputs); + std::cout << out << "\n"; + + if (torch::cuda::is_available()) { + // Move model and inputs to GPU + model.to(torch::kCUDA); + + // Add GPU inputs + inputs.clear(); + torch::TensorOptions options = torch::TensorOptions{torch::kCUDA}; + if (std::strstr(argv[1], "fasterrcnn") != NULL) { + // Faster RCNN accepts a List[Tensor] as main input + std::vector images; + images.push_back(torch::rand({3, 256, 275}, options)); + images.push_back(torch::rand({3, 256, 275}, options)); + inputs.push_back(images); + } else { + inputs.push_back(torch::rand({1, 3, 10, 10}, options)); + } + + auto gpu_out = model.forward(inputs); + std::cout << gpu_out << "\n"; + } +} diff --git a/examples/cpp/script_model.py b/examples/cpp/script_model.py new file mode 100644 index 00000000000..e91e888e7be --- /dev/null +++ b/examples/cpp/script_model.py @@ -0,0 +1,10 @@ +import torch +from torchvision import models + +for model, name in ( + (models.resnet18(weights=None), "resnet18"), + (models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None), "fasterrcnn_resnet50_fpn"), +): + model.eval() + traced_model = torch.jit.script(model) + traced_model.save(f"{name}.pt") diff --git a/gallery/README.rst b/gallery/README.rst index 868afe74351..8dfea355276 100644 --- 
a/gallery/README.rst +++ b/gallery/README.rst @@ -1,4 +1,4 @@ -Example gallery -=============== +.. _gallery: -Below is a gallery of examples +Examples and tutorials +====================== diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 00000000000..9be80c7c273 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 00000000000..9f8efef9928 --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 00000000000..fe0e09270bf --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/assets/leaning_tower.jpg b/gallery/assets/leaning_tower.jpg new file mode 100644 index 00000000000..fc6e0779f7c Binary files /dev/null and b/gallery/assets/leaning_tower.jpg differ diff --git a/gallery/others/README.rst b/gallery/others/README.rst new file mode 100644 index 00000000000..fafb007d985 --- /dev/null +++ b/gallery/others/README.rst @@ -0,0 +1,2 @@ +Others +------ diff --git a/gallery/plot_optical_flow.py b/gallery/others/plot_optical_flow.py similarity index 90% rename from gallery/plot_optical_flow.py rename to gallery/others/plot_optical_flow.py index b0a93209877..6296c8e667e 100644 --- a/gallery/plot_optical_flow.py +++ b/gallery/others/plot_optical_flow.py @@ -3,6 +3,10 @@ Optical Flow: Predicting movement with the RAFT model ===================================================== +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + Optical flow is the task of predicting movement between two images, usually two consecutive frames of a video. 
Optical flow models take two images as input, and predict a flow: the flow indicates the displacement of every single pixel in the @@ -42,7 +46,7 @@ def plot(imgs, **imshow_kwargs): plt.tight_layout() -################################### +# %% # Reading Videos Using Torchvision # -------------------------------- # We will first read a video using :func:`~torchvision.io.read_video`. @@ -62,7 +66,7 @@ def plot(imgs, **imshow_kwargs): video_path = Path(tempfile.mkdtemp()) / "basketball.mp4" _ = urlretrieve(video_url, video_path) -######################### +# %% # :func:`~torchvision.io.read_video` returns the video frames, audio frames and # the metadata associated with the video. In our case, we only need the video # frames. @@ -79,11 +83,12 @@ def plot(imgs, **imshow_kwargs): plot(img1_batch) -######################### +# %% # The RAFT model accepts RGB images. We first get the frames from -# :func:`~torchvision.io.read_video` and resize them to ensure their -# dimensions are divisible by 8. Then we use the transforms bundled into the -# weights in order to preprocess the input and rescale its values to the +# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions +# are divisible by 8. Note that we explicitly use ``antialias=False``, because +# this is how those models were trained. Then we use the transforms bundled into +# the weights in order to preprocess the input and rescale its values to the # required ``[-1, 1]`` interval. from torchvision.models.optical_flow import Raft_Large_Weights @@ -93,8 +98,8 @@ def plot(imgs, **imshow_kwargs): def preprocess(img1_batch, img2_batch): - img1_batch = F.resize(img1_batch, size=[520, 960]) - img2_batch = F.resize(img2_batch, size=[520, 960]) + img1_batch = F.resize(img1_batch, size=[520, 960], antialias=False) + img2_batch = F.resize(img2_batch, size=[520, 960], antialias=False) return transforms(img1_batch, img2_batch) @@ -103,7 +108,7 @@ def preprocess(img1_batch, img2_batch): print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}") -#################################### +# %% # Estimating Optical flow using RAFT # ---------------------------------- # We will use our RAFT implementation from @@ -124,12 +129,12 @@ def preprocess(img1_batch, img2_batch): print(f"type = {type(list_of_flows)}") print(f"length = {len(list_of_flows)} = number of iterations of the model") -#################################### +# %% # The RAFT model outputs lists of predicted flows where each entry is a # (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration" # in the model. For more details on the iterative nature of the model, please # refer to the `original paper `_. Here, we -# are only interested in the final predicted flows (they are the most acccurate +# are only interested in the final predicted flows (they are the most accurate # ones), so we will just retrieve the last item in the list. # # As described above, a flow is a tensor with dimensions (2, H, W) (or (N, 2, H, @@ -143,10 +148,10 @@ def preprocess(img1_batch, img2_batch): print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}") -#################################### +# %% # Visualizing predicted flows # --------------------------- -# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utlity to +# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to # convert a flow into an RGB image. It also supports batches of flows. # each "direction" in the flow will be mapped to a given RGB color. 
In the # images below, pixels with similar colors are assumed by the model to be moving @@ -165,7 +170,7 @@ def preprocess(img1_batch, img2_batch): grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)] plot(grid) -#################################### +# %% # Bonus: Creating GIFs of predicted flows # --------------------------------------- # In the example above we have only shown the predicted flows of 2 pairs of @@ -186,7 +191,7 @@ def preprocess(img1_batch, img2_batch): # output_folder = "/tmp/" # Update this to the folder of your choice # write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg") -#################################### +# %% # Once the .jpg flow images are saved, you can convert them into a video or a # GIF using ffmpeg with e.g.: # diff --git a/gallery/plot_repurposing_annotations.py b/gallery/others/plot_repurposing_annotations.py similarity index 91% rename from gallery/plot_repurposing_annotations.py rename to gallery/others/plot_repurposing_annotations.py index 7bb68617a17..2c2e10ffb2a 100644 --- a/gallery/plot_repurposing_annotations.py +++ b/gallery/others/plot_repurposing_annotations.py @@ -3,6 +3,10 @@ Repurposing masks into bounding boxes ===================================== +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + The following example illustrates the operations available the :ref:`torchvision.ops ` module for repurposing segmentation masks into object localization annotations for different tasks @@ -20,7 +24,7 @@ import torchvision.transforms.functional as F -ASSETS_DIRECTORY = "assets" +ASSETS_DIRECTORY = "../assets" plt.rcParams["savefig.bbox"] = "tight" @@ -36,7 +40,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Masks # ----- # In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package, @@ -53,7 +57,7 @@ def show(imgs): # A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object # localization tasks. -#################################### +# %% # Converting Masks to Bounding Boxes # ----------------------------------------------- # For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to @@ -62,15 +66,15 @@ def show(imgs): # We will take images and masks from the `PenFudan Dataset `_. -from torchvision.io import read_image +from torchvision.io import decode_image img_path = os.path.join(ASSETS_DIRECTORY, "FudanPed00054.png") mask_path = os.path.join(ASSETS_DIRECTORY, "FudanPed00054_mask.png") -img = read_image(img_path) -mask = read_image(mask_path) +img = decode_image(img_path) +mask = decode_image(mask_path) -######################### +# %% # Here the masks are represented as a PNG Image, with floating point values. # Each pixel is encoded as different colors, with 0 being background. # Notice that the spatial dimensions of image and mask match. @@ -79,7 +83,7 @@ def show(imgs): print(img.size()) print(mask) -############################ +# %% # We get the unique colors, as these would be the object ids. obj_ids = torch.unique(mask) @@ -91,7 +95,7 @@ def show(imgs): # Note that this snippet would work as well if the masks were float values instead of ints. masks = mask == obj_ids[:, None, None] -######################## +# %% # Now the masks are a boolean tensor. 
# The first dimension in this case 3 and denotes the number of instances: there are 3 people in the image. # The other two dimensions are height and width, which are equal to the dimensions of the image. @@ -101,7 +105,7 @@ def show(imgs): print(masks.size()) print(masks) -#################################### +# %% # Let us visualize an image and plot its corresponding segmentation masks. # We will use the :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks. @@ -113,7 +117,7 @@ def show(imgs): show(drawn_masks) -#################################### +# %% # To convert the boolean masks into bounding boxes. # We will use the :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module # It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format. @@ -124,7 +128,7 @@ def show(imgs): print(boxes.size()) print(boxes) -#################################### +# %% # As the shape denotes, there are 3 boxes and in ``(xmin, ymin, xmax, ymax)`` format. # These can be visualized very easily with :func:`~torchvision.utils.draw_bounding_boxes` utility # provided in :ref:`torchvision.utils `. @@ -134,7 +138,7 @@ def show(imgs): drawn_boxes = draw_bounding_boxes(img, boxes, colors="red") show(drawn_boxes) -################################### +# %% # These boxes can now directly be used by detection models in torchvision. # Here is demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -145,15 +149,15 @@ def show(imgs): model = fasterrcnn_resnet50_fpn(weights=weights, progress=False) print(img.size()) -tranforms = weights.transforms() -img = tranforms(img) +transforms = weights.transforms() +img = transforms(img) target = {} target["boxes"] = boxes target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64) detection_outputs = model(img.unsqueeze(0), [target]) -#################################### +# %% # Converting Segmentation Dataset to Detection Dataset # ---------------------------------------------------- # @@ -177,8 +181,8 @@ def __getitem__(self, idx): img_path = os.path.join(self.root, "PNGImages", self.imgs[idx]) mask_path = os.path.join(self.root, "PedMasks", self.masks[idx]) - img = read_image(img_path) - mask = read_image(mask_path) + img = decode_image(img_path) + mask = decode_image(mask_path) img = F.convert_image_dtype(img, dtype=torch.float) mask = F.convert_image_dtype(mask, dtype=torch.float) diff --git a/gallery/others/plot_scripted_tensor_transforms.py b/gallery/others/plot_scripted_tensor_transforms.py new file mode 100644 index 00000000000..da2213347e3 --- /dev/null +++ b/gallery/others/plot_scripted_tensor_transforms.py @@ -0,0 +1,136 @@ +""" +=================== +Torchscript support +=================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates `torchscript +`_ support of the torchvision +:ref:`transforms ` on Tensor images. +""" + +# %% +from pathlib import Path + +import matplotlib.pyplot as plt + +import torch +import torch.nn as nn + +import torchvision.transforms as v1 +from torchvision.io import decode_image + +plt.rcParams["savefig.bbox"] = 'tight' +torch.manual_seed(1) + +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +import sys +sys.path += ["../transforms"] +from helpers import plot +ASSETS_PATH = Path('../assets') + + +# %% +# Most transforms support torchscript. 
For composing transforms, we use +# :class:`torch.nn.Sequential` instead of +# :class:`~torchvision.transforms.v2.Compose`: + +dog1 = decode_image(str(ASSETS_PATH / 'dog1.jpg')) +dog2 = decode_image(str(ASSETS_PATH / 'dog2.jpg')) + +transforms = torch.nn.Sequential( + v1.RandomCrop(224), + v1.RandomHorizontalFlip(p=0.3), +) + +scripted_transforms = torch.jit.script(transforms) + +plot([dog1, scripted_transforms(dog1), dog2, scripted_transforms(dog2)]) + + +# %% +# .. warning:: +# +# Above we have used transforms from the ``torchvision.transforms`` +# namespace, i.e. the "v1" transforms. The v2 transforms from the +# ``torchvision.transforms.v2`` namespace are the :ref:`recommended +# ` way to use transforms in your code. +# +# The v2 transforms also support torchscript, but if you call +# ``torch.jit.script()`` on a v2 **class** transform, you'll actually end up +# with its (scripted) v1 equivalent. This may lead to slightly different +# results between the scripted and eager executions due to implementation +# differences between v1 and v2. +# +# If you really need torchscript support for the v2 transforms, **we +# recommend scripting the functionals** from the +# ``torchvision.transforms.v2.functional`` namespace to avoid surprises. +# +# Below we now show how to combine image transformations and a model forward +# pass, while using ``torch.jit.script`` to obtain a single scripted module. +# +# Let's define a ``Predictor`` module that transforms the input tensor and then +# applies an ImageNet model on it. + +from torchvision.models import resnet18, ResNet18_Weights + + +class Predictor(nn.Module): + + def __init__(self): + super().__init__() + weights = ResNet18_Weights.DEFAULT + self.resnet18 = resnet18(weights=weights, progress=False).eval() + self.transforms = weights.transforms(antialias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + x = self.transforms(x) + y_pred = self.resnet18(x) + return y_pred.argmax(dim=1) + + +# %% +# Now, let's define scripted and non-scripted instances of ``Predictor`` and +# apply it on multiple tensor images of the same size + +device = "cuda" if torch.cuda.is_available() else "cpu" + +predictor = Predictor().to(device) +scripted_predictor = torch.jit.script(predictor).to(device) + +batch = torch.stack([dog1, dog2]).to(device) + +res = predictor(batch) +res_scripted = scripted_predictor(batch) + +# %% +# We can verify that the prediction of the scripted and non-scripted models are +# the same: + +import json + +with open(Path('../assets') / 'imagenet_class_index.json') as labels_file: + labels = json.load(labels_file) + +for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): + assert pred == pred_scripted + print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") + +# %% +# Since the model is scripted, it can be easily dumped on disk and re-used + +import tempfile + +with tempfile.NamedTemporaryFile() as f: + scripted_predictor.save(f.name) + + dumped_scripted_predictor = torch.jit.load(f.name) + res_scripted_dumped = dumped_scripted_predictor(batch) +assert (res_scripted_dumped == res_scripted).all() + +# %% diff --git a/gallery/plot_visualization_utils.py b/gallery/others/plot_visualization_utils.py similarity index 82% rename from gallery/plot_visualization_utils.py rename to gallery/others/plot_visualization_utils.py index b04e0b6cffa..72c35b53717 100644 --- a/gallery/plot_visualization_utils.py +++ b/gallery/others/plot_visualization_utils.py @@ -3,6 +3,10 @@ Visualization utilities 
======================= +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + This example illustrates some of the utilities that torchvision offers for visualizing images, bounding boxes, segmentation masks and keypoints. """ @@ -30,7 +34,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Visualizing a grid of images # ---------------------------- # The :func:`~torchvision.utils.make_grid` function can be used to create a @@ -38,17 +42,17 @@ def show(imgs): # image of dtype ``uint8`` as input. from torchvision.utils import make_grid -from torchvision.io import read_image +from torchvision.io import decode_image from pathlib import Path -dog1_int = read_image(str(Path('assets') / 'dog1.jpg')) -dog2_int = read_image(str(Path('assets') / 'dog2.jpg')) +dog1_int = decode_image(str(Path('../assets') / 'dog1.jpg')) +dog2_int = decode_image(str(Path('../assets') / 'dog2.jpg')) dog_list = [dog1_int, dog2_int] grid = make_grid(dog_list) show(grid) -#################################### +# %% # Visualizing bounding boxes # -------------------------- # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an @@ -64,7 +68,7 @@ def show(imgs): show(result) -##################################### +# %% # Naturally, we can also plot bounding boxes produced by torchvision detection # models. Here is a demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -85,7 +89,7 @@ def show(imgs): outputs = model(images) print(outputs) -##################################### +# %% # Let's plot the boxes detected by our model. We will only plot the boxes with a # score greater than a given threshold. @@ -96,7 +100,7 @@ def show(imgs): ] show(dogs_with_boxes) -##################################### +# %% # Visualizing segmentation masks # ------------------------------ # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to @@ -125,7 +129,7 @@ def show(imgs): output = model(batch)['out'] print(output.shape, output.min().item(), output.max().item()) -##################################### +# %% # As we can see above, the output of the segmentation model is a tensor of shape # ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and # we can normalize them into ``[0, 1]`` by using a softmax. After the softmax, @@ -147,7 +151,7 @@ def show(imgs): show(dog_and_boat_masks) -##################################### +# %% # As expected, the model is confident about the dog class, but not so much for # the boat class. # @@ -162,7 +166,7 @@ def show(imgs): show([m.float() for m in boolean_dog_masks]) -##################################### +# %% # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you # can read it as the following query: "For which pixels is 'dog' the most likely # class?" @@ -184,11 +188,11 @@ def show(imgs): ] show(dogs_with_masks) -##################################### +# %% # We can plot more than one mask per image! Remember that the model returned as # many masks as there are classes. Let's ask the same query as above, but this # time for *all* classes, not just the dog class: "For each pixel and each class -# C, is class C the most most likely class?" +# C, is class C the most likely class?" 
# # This one is a bit more involved, so we'll first show how to do it with a # single image, and then we'll generalize to the batch @@ -204,7 +208,7 @@ def show(imgs): dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6) show(dog_with_all_masks) -##################################### +# %% # We can see in the image above that only 2 masks were drawn: the mask for the # background and the mask for the dog. This is because the model thinks that # only these 2 classes are the most likely ones across all the pixels. If the @@ -231,7 +235,7 @@ def show(imgs): show(dogs_with_masks) -##################################### +# %% # .. _instance_seg_output: # # Instance segmentation models @@ -265,7 +269,7 @@ def show(imgs): output = model(images) print(output) -##################################### +# %% # Let's break this down. For each image in the batch, the model outputs some # detections (or instances). The number of detections varies for each input # image. Each instance is described by its bounding box, its label, its score @@ -288,7 +292,7 @@ def show(imgs): print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " f"min = {dog1_masks.min()}, max = {dog1_masks.max()}") -##################################### +# %% # Here the masks correspond to probabilities indicating, for each pixel, how # likely it is to belong to the predicted label of that instance. Those # predicted labels correspond to the 'labels' element in the same output dict. @@ -297,7 +301,7 @@ def show(imgs): print("For the first dog, the following instances were detected:") print([weights.meta["categories"][label] for label in dog1_output['labels']]) -##################################### +# %% # Interestingly, the model detects two persons in the image. Let's go ahead and # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks` # expects boolean masks, we need to convert those probabilities into boolean @@ -315,14 +319,14 @@ def show(imgs): show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9)) -##################################### +# %% # The model seems to have properly detected the dog, but it also confused trees -# with people. Looking more closely at the scores will help us plotting more +# with people. Looking more closely at the scores will help us plot more # relevant masks: print(dog1_output['scores']) -##################################### +# %% # Clearly the model is more confident about the dog detection than it is about # the people detections. That's good news. When plotting the masks, we can ask # for only those that have a good score. Let's use a score threshold of .75 @@ -341,12 +345,12 @@ def show(imgs): ] show(dogs_with_masks) -##################################### +# %% # The two 'people' masks in the first image where not selected because they have -# a lower score than the score threshold. Similarly in the second image, the +# a lower score than the score threshold. Similarly, in the second image, the # instance with class 15 (which corresponds to 'bench') was not selected. -##################################### +# %% # .. 
_keypoint_output: # # Visualizing keypoints @@ -358,9 +362,9 @@ def show(imgs): # from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights -from torchvision.io import read_image +from torchvision.io import decode_image -person_int = read_image(str(Path("assets") / "person1.jpg")) +person_int = decode_image(str(Path("../assets") / "person1.jpg")) weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT transforms = weights.transforms() @@ -373,7 +377,7 @@ def show(imgs): outputs = model([person_float]) print(outputs) -##################################### +# %% # As we see the output contains a list of dictionaries. # The output list is of length batch_size. # We currently have just a single image so length of list is 1. @@ -388,7 +392,7 @@ def show(imgs): print(kpts) print(scores) -##################################### +# %% # The KeypointRCNN model detects there are two instances in the image. # If you plot the boxes by using :func:`~draw_bounding_boxes` # you would recognize they are the person and the surfboard. @@ -402,7 +406,7 @@ def show(imgs): print(keypoints) -##################################### +# %% # Great, now we have the keypoints corresponding to the person. # Each keypoint is represented by x, y coordinates and the visibility. # We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints. @@ -413,8 +417,8 @@ def show(imgs): res = draw_keypoints(person_int, keypoints, colors="blue", radius=3) show(res) -##################################### -# As we see the keypoints appear as colored circles over the image. +# %% +# As we see, the keypoints appear as colored circles over the image. # The coco keypoints for a person are ordered and represent the following list.\ coco_keypoints = [ @@ -424,7 +428,7 @@ def show(imgs): "left_knee", "right_knee", "left_ankle", "right_ankle", ] -##################################### +# %% # What if we are interested in joining the keypoints? # This is especially useful in creating pose detection or action recognition. # We can join the keypoints easily using the `connectivity` parameter. @@ -450,9 +454,69 @@ def show(imgs): (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16) ] -##################################### +# %% # We pass the above list to the connectivity parameter to connect the keypoints. # res = draw_keypoints(person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3) show(res) + +# %% +# That looks pretty good. +# +# .. _draw_keypoints_with_visibility: +# +# Drawing Keypoints with Visibility +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Let's have a look at the results, another keypoint prediction module produced, and show the connectivity: + +prediction = torch.tensor( + [[[208.0176, 214.2409, 1.0000], + [000.0000, 000.0000, 0.0000], + [197.8246, 210.6392, 1.0000], + [000.0000, 000.0000, 0.0000], + [178.6378, 217.8425, 1.0000], + [221.2086, 253.8591, 1.0000], + [160.6502, 269.4662, 1.0000], + [243.9929, 304.2822, 1.0000], + [138.4654, 328.8935, 1.0000], + [277.5698, 340.8990, 1.0000], + [153.4551, 374.5145, 1.0000], + [000.0000, 000.0000, 0.0000], + [226.0053, 370.3125, 1.0000], + [221.8081, 455.5516, 1.0000], + [273.9723, 448.9486, 1.0000], + [193.6275, 546.1933, 1.0000], + [273.3727, 545.5930, 1.0000]]] +) + +res = draw_keypoints(person_int, prediction, connectivity=connect_skeleton, colors="blue", radius=4, width=3) +show(res) + +# %% +# What happened there? 
+# The model, which predicted the new keypoints,
+# can't detect the three points that are hidden on the upper left body of the skateboarder.
+# More precisely, the model predicted that `(x, y, vis) = (0, 0, 0)` for the left_eye, left_ear, and left_hip.
+# So we definitely don't want to display those keypoints and connections, and you don't have to.
+# Looking at the parameters of :func:`~torchvision.utils.draw_keypoints`,
+# we can see that we can pass a visibility tensor as an additional argument.
+# Given the model's prediction, the visibility is already there as the third keypoint dimension; we just need to extract it.
+# Let's split the ``prediction`` into the keypoint coordinates and their respective visibility,
+# and pass both of them as arguments to :func:`~torchvision.utils.draw_keypoints`.
+
+coordinates, visibility = prediction.split([2, 1], dim=-1)
+visibility = visibility.bool()
+
+res = draw_keypoints(
+    person_int, coordinates, visibility=visibility, connectivity=connect_skeleton, colors="blue", radius=4, width=3
+)
+show(res)
+
+# %%
+# We can see that the undetected keypoints are not drawn and the invisible keypoint connections were skipped.
+# This can reduce the noise on images with multiple detections, or in cases like ours,
+# when the keypoint-prediction model missed some detections.
+# Most torch keypoint-prediction models return the visibility for every prediction, ready for you to use.
+# The :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn` model,
+# which we used in the first case, does so too.
diff --git a/gallery/plot_scripted_tensor_transforms.py b/gallery/plot_scripted_tensor_transforms.py
deleted file mode 100644
index 995383d4603..00000000000
--- a/gallery/plot_scripted_tensor_transforms.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""
-=========================
-Tensor transforms and JIT
-=========================
-
-This example illustrates various features that are now supported by the
-:ref:`image transformations ` on Tensor images. In particular, we
-show how image transforms can be performed on GPU, and how one can also script
-them using JIT compilation.
-
-Prior to v0.8.0, transforms in torchvision have traditionally been PIL-centric
-and presented multiple limitations due to that. Now, since v0.8.0, transforms
-implementations are Tensor and PIL compatible and we can achieve the following
-new features:
-
-- transform multi-band torch tensor images (with more than 3-4 channels)
-- torchscript transforms together with your model for deployment
-- support for GPU acceleration
-- batched transformation such as for videos
-- read and decode data directly as torch tensor with torchscript support (for PNG and JPEG image formats)
-
-.. note::
-    These features are only possible with **Tensor** images.
-""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - -import torch -import torchvision.transforms as T -from torchvision.io import read_image - - -plt.rcParams["savefig.bbox"] = 'tight' -torch.manual_seed(1) - - -def show(imgs): - fix, axs = plt.subplots(ncols=len(imgs), squeeze=False) - for i, img in enumerate(imgs): - img = T.ToPILImage()(img.to('cpu')) - axs[0, i].imshow(np.asarray(img)) - axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - -#################################### -# The :func:`~torchvision.io.read_image` function allows to read an image and -# directly load it as a tensor - -dog1 = read_image(str(Path('assets') / 'dog1.jpg')) -dog2 = read_image(str(Path('assets') / 'dog2.jpg')) -show([dog1, dog2]) - -#################################### -# Transforming images on GPU -# -------------------------- -# Most transforms natively support tensors on top of PIL images (to visualize -# the effect of the transforms, you may refer to see -# :ref:`sphx_glr_auto_examples_plot_transforms.py`). -# Using tensor images, we can run the transforms on GPUs if cuda is available! - -import torch.nn as nn - -transforms = torch.nn.Sequential( - T.RandomCrop(224), - T.RandomHorizontalFlip(p=0.3), -) - -device = 'cuda' if torch.cuda.is_available() else 'cpu' -dog1 = dog1.to(device) -dog2 = dog2.to(device) - -transformed_dog1 = transforms(dog1) -transformed_dog2 = transforms(dog2) -show([transformed_dog1, transformed_dog2]) - -#################################### -# Scriptable transforms for easier deployment via torchscript -# ----------------------------------------------------------- -# We now show how to combine image transformations and a model forward pass, -# while using ``torch.jit.script`` to obtain a single scripted module. -# -# Let's define a ``Predictor`` module that transforms the input tensor and then -# applies an ImageNet model on it. 
- -from torchvision.models import resnet18, ResNet18_Weights - - -class Predictor(nn.Module): - - def __init__(self): - super().__init__() - weights = ResNet18_Weights.DEFAULT - self.resnet18 = resnet18(weights=weights, progress=False).eval() - self.transforms = weights.transforms() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - with torch.no_grad(): - x = self.transforms(x) - y_pred = self.resnet18(x) - return y_pred.argmax(dim=1) - - -#################################### -# Now, let's define scripted and non-scripted instances of ``Predictor`` and -# apply it on multiple tensor images of the same size - -predictor = Predictor().to(device) -scripted_predictor = torch.jit.script(predictor).to(device) - -batch = torch.stack([dog1, dog2]).to(device) - -res = predictor(batch) -res_scripted = scripted_predictor(batch) - -#################################### -# We can verify that the prediction of the scripted and non-scripted models are -# the same: - -import json - -with open(Path('assets') / 'imagenet_class_index.json') as labels_file: - labels = json.load(labels_file) - -for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): - assert pred == pred_scripted - print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") - -#################################### -# Since the model is scripted, it can be easily dumped on disk and re-used - -import tempfile - -with tempfile.NamedTemporaryFile() as f: - scripted_predictor.save(f.name) - - dumped_scripted_predictor = torch.jit.load(f.name) - res_scripted_dumped = dumped_scripted_predictor(batch) -assert (res_scripted_dumped == res_scripted).all() diff --git a/gallery/plot_video_api.py b/gallery/plot_video_api.py deleted file mode 100644 index d83a508eabe..00000000000 --- a/gallery/plot_video_api.py +++ /dev/null @@ -1,341 +0,0 @@ -""" -======================= -Video API -======================= - -This example illustrates some of the APIs that torchvision offers for -videos, together with the examples on how to build datasets and more. -""" - -#################################### -# 1. Introduction: building a new video object and examining the properties -# ------------------------------------------------------------------------- -# First we select a video to test the object out. For the sake of argument -# we're using one from kinetics400 dataset. -# To create it, we need to define the path and the stream we want to use. - -###################################### -# Chosen video statistics: -# -# - WUzgd7C1pWA.mp4 -# - source: -# - kinetics-400 -# - video: -# - H-264 -# - MPEG-4 AVC (part 10) (avc1) -# - fps: 29.97 -# - audio: -# - MPEG AAC audio (mp4a) -# - sample rate: 48K Hz -# - -import torch -import torchvision -from torchvision.datasets.utils import download_url - -# Download the sample video -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - ".", - "WUzgd7C1pWA.mp4" -) -video_path = "./WUzgd7C1pWA.mp4" - -###################################### -# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form -# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. -# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. 
-# Firstly, let's get the metadata for our particular video: - -stream = "video" -video = torchvision.io.VideoReader(video_path, stream) -video.get_metadata() - -###################################### -# Here we can see that video has two streams - a video and an audio stream. -# Currently available stream types include ['video', 'audio']. -# Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id -# (which are determined by video encoding). -# In this way, if the video container contains multiple streams of the same type, -# users can access the one they want. -# If only stream type is passed, the decoder auto-detects first stream of that type and returns it. - -###################################### -# Let's read all the frames from the video stream. By default, the return value of -# ``next(video_reader)`` is a dict containing the following fields. -# -# The return fields are: -# -# - ``data``: containing a torch.tensor -# - ``pts``: containing a float timestamp of this particular frame - -metadata = video.get_metadata() -video.set_current_stream("audio") - -frames = [] # we are going to save the frames here. -ptss = [] # pts is a presentation timestamp in seconds (float) of each frame -for frame in video: - frames.append(frame['data']) - ptss.append(frame['pts']) - -print("PTS for first five frames ", ptss[:5]) -print("Total number of frames: ", len(frames)) -approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] -print("Approx total number of datapoints we can expect: ", approx_nf) -print("Read data size: ", frames[0].size(0) * len(frames)) - -###################################### -# But what if we only want to read certain time segment of the video? -# That can be done easily using the combination of our ``seek`` function, and the fact that each call -# to next returns the presentation timestamp of the returned frame in seconds. -# -# Given that our implementation relies on python iterators, -# we can leverage itertools to simplify the process and make it more pythonic. -# -# For example, if we wanted to read ten frames from second second: - - -import itertools -video.set_current_stream("video") - -frames = [] # we are going to save the frames here. - -# We seek into a second second of the video and use islice to get 10 frames since -for frame, pts in itertools.islice(video.seek(2), 10): - frames.append(frame) - -print("Total number of frames: ", len(frames)) - -###################################### -# Or if we wanted to read from 2nd to 5th second, -# We seek into a second second of the video, -# then we utilize the itertools takewhile to get the -# correct number of frames: - -video.set_current_stream("video") -frames = [] # we are going to save the frames here. -video = video.seek(2) - -for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video): - frames.append(frame['data']) - -print("Total number of frames: ", len(frames)) -approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] -print("We can expect approx: ", approx_nf) -print("Tensor size: ", frames[0].size()) - -#################################### -# 2. Building a sample read_video function -# ---------------------------------------------------------------------------------------- -# We can utilize the methods above to build the read video function that follows -# the same API to the existing ``read_video`` function. 
- - -def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True): - if end is None: - end = float("inf") - if end < start: - raise ValueError( - "end time should be larger than start time, got " - f"start time={start} and end time={end}" - ) - - video_frames = torch.empty(0) - video_pts = [] - if read_video: - video_object.set_current_stream("video") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - video_pts.append(frame['pts']) - if len(frames) > 0: - video_frames = torch.stack(frames, 0) - - audio_frames = torch.empty(0) - audio_pts = [] - if read_audio: - video_object.set_current_stream("audio") - frames = [] - for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)): - frames.append(frame['data']) - audio_pts.append(frame['pts']) - if len(frames) > 0: - audio_frames = torch.cat(frames, 0) - - return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata() - - -# Total number of frames should be 327 for video and 523264 datapoints for audio -vf, af, info, meta = example_read_video(video) -print(vf.size(), af.size()) - -#################################### -# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) -# ------------------------------------------------------------------------------------------------------- -# Cool, so now we can use the same principle to make the sample dataset. -# We suggest trying out iterable dataset for this purpose. -# Here, we are going to build an example dataset that reads randomly selected 10 frames of video. - -#################################### -# Make sample dataset -import os -os.makedirs("./dataset", exist_ok=True) -os.makedirs("./dataset/1", exist_ok=True) -os.makedirs("./dataset/2", exist_ok=True) - -#################################### -# Download the videos -from torchvision.datasets.utils import download_url -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true", - "./dataset/1", "WUzgd7C1pWA.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true", - "./dataset/1", - "RATRACE_wave_f_nm_np1_fr_goo_37.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true", - "./dataset/2", - "SOX5yA1l24A.mp4" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g23_c01.avi" -) -download_url( - "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true", - "./dataset/2", - "v_SoccerJuggling_g24_c01.avi" -) - -#################################### -# Housekeeping and utilities -import os -import random - -from torchvision.datasets.folder import make_dataset -from torchvision import transforms as t - - -def _find_classes(dir): - classes = [d.name for d in os.scandir(dir) if d.is_dir()] - classes.sort() - class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)} - return classes, class_to_idx - - -def get_samples(root, extensions=(".mp4", ".avi")): - _, class_to_idx = _find_classes(root) - return make_dataset(root, class_to_idx, extensions=extensions) - -#################################### -# We are going to define the dataset and some basic arguments. 
-# We assume the structure of the FolderDataset, and add the following parameters: -# -# - ``clip_len``: length of a clip in frames -# - ``frame_transform``: transform for every frame individually -# - ``video_transform``: transform on a video sequence -# -# .. note:: -# We actually add epoch size as using :func:`~torch.utils.data.IterableDataset` -# class allows us to naturally oversample clips or images from each video if needed. - - -class RandomDataset(torch.utils.data.IterableDataset): - def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16): - super(RandomDataset).__init__() - - self.samples = get_samples(root) - - # Allow for temporal jittering - if epoch_size is None: - epoch_size = len(self.samples) - self.epoch_size = epoch_size - - self.clip_len = clip_len - self.frame_transform = frame_transform - self.video_transform = video_transform - - def __iter__(self): - for i in range(self.epoch_size): - # Get random sample - path, target = random.choice(self.samples) - # Get video object - vid = torchvision.io.VideoReader(path, "video") - metadata = vid.get_metadata() - video_frames = [] # video frame buffer - - # Seek and return frames - max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0]) - start = random.uniform(0., max_seek) - for frame in itertools.islice(vid.seek(start), self.clip_len): - video_frames.append(self.frame_transform(frame['data'])) - current_pts = frame['pts'] - # Stack it into a tensor - video = torch.stack(video_frames, 0) - if self.video_transform: - video = self.video_transform(video) - output = { - 'path': path, - 'video': video, - 'target': target, - 'start': start, - 'end': current_pts} - yield output - -#################################### -# Given a path of videos in a folder structure, i.e: -# -# - dataset -# - class 1 -# - file 0 -# - file 1 -# - ... -# - class 2 -# - file 0 -# - file 1 -# - ... -# - ... -# -# We can generate a dataloader and test the dataset. - - -transforms = [t.Resize((112, 112))] -frame_transform = t.Compose(transforms) - -dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) - -#################################### -from torch.utils.data import DataLoader -loader = DataLoader(dataset, batch_size=12) -data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} -for batch in loader: - for i in range(len(batch['path'])): - data['video'].append(batch['path'][i]) - data['start'].append(batch['start'][i].item()) - data['end'].append(batch['end'][i].item()) - data['tensorsize'].append(batch['video'][i].size()) -print(data) - -#################################### -# 4. Data Visualization -# ---------------------------------- -# Example of visualized video - -import matplotlib.pyplot as plt - -plt.figure(figsize=(12, 12)) -for i in range(16): - plt.subplot(4, 4, i + 1) - plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) - plt.axis("off") - -#################################### -# Cleanup the video and dataset: -import os -import shutil -os.remove("./WUzgd7C1pWA.mp4") -shutil.rmtree("./dataset") diff --git a/gallery/transforms/README.rst b/gallery/transforms/README.rst new file mode 100644 index 00000000000..1b8b1b08155 --- /dev/null +++ b/gallery/transforms/README.rst @@ -0,0 +1,4 @@ +.. 
_transforms_gallery: + +Transforms +---------- diff --git a/gallery/transforms/helpers.py b/gallery/transforms/helpers.py new file mode 100644 index 00000000000..bc8de0d2ad1 --- /dev/null +++ b/gallery/transforms/helpers.py @@ -0,0 +1,56 @@ +import matplotlib.pyplot as plt +import torch +from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks +from torchvision import tv_tensors +from torchvision.transforms import v2 +from torchvision.transforms.v2 import functional as F + + +def plot(imgs, row_title=None, bbox_width=3, **imshow_kwargs): + if not isinstance(imgs[0], list): + # Make a 2d grid even if there's just 1 row + imgs = [imgs] + + num_rows = len(imgs) + num_cols = len(imgs[0]) + _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) + for row_idx, row in enumerate(imgs): + for col_idx, img in enumerate(row): + boxes = None + masks = None + if isinstance(img, tuple): + img, target = img + if isinstance(target, dict): + boxes = target.get("boxes") + masks = target.get("masks") + elif isinstance(target, tv_tensors.BoundingBoxes): + boxes = target + + # Conversion necessary because draw_bounding_boxes() only + # work with this specific format. + if tv_tensors.is_rotated_bounding_format(boxes.format): + boxes = v2.ConvertBoundingBoxFormat("xyxyxyxy")(boxes) + else: + raise ValueError(f"Unexpected target type: {type(target)}") + img = F.to_image(img) + if img.dtype.is_floating_point and img.min() < 0: + # Poor man's re-normalization for the colors to be OK-ish. This + # is useful for images coming out of Normalize() + img -= img.min() + img /= img.max() + + img = F.to_dtype(img, torch.uint8, scale=True) + if boxes is not None: + img = draw_bounding_boxes(img, boxes, colors="yellow", width=bbox_width) + if masks is not None: + img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65) + + ax = axs[row_idx, col_idx] + ax.imshow(img.permute(1, 2, 0).numpy(), **imshow_kwargs) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + + if row_title is not None: + for row_idx in range(num_rows): + axs[row_idx, 0].set(ylabel=row_title[row_idx]) + + plt.tight_layout() diff --git a/gallery/transforms/plot_custom_transforms.py b/gallery/transforms/plot_custom_transforms.py new file mode 100644 index 00000000000..d1bd9455bfb --- /dev/null +++ b/gallery/transforms/plot_custom_transforms.py @@ -0,0 +1,200 @@ +""" +=================================== +How to write your own v2 transforms +=================================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This guide explains how to write transforms that are compatible with the +torchvision transforms V2 API. +""" + +# %% +from typing import Any, Dict, List + +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +# %% +# Just create a ``nn.Module`` and override the ``forward`` method +# =============================================================== +# +# In most cases, this is all you're going to need, as long as you already know +# the structure of the input that your transform will expect. For example if +# you're just doing image classification, your transform will typically accept a +# single image as input, or a ``(img, label)`` input. So you can just hard-code +# your ``forward`` method to accept just that, e.g. +# +# .. 
code:: python +# +# class MyCustomTransform(torch.nn.Module): +# def forward(self, img, label): +# # Do some transformations +# return new_img, new_label +# +# .. note:: +# +# This means that if you have a custom transform that is already compatible +# with the V1 transforms (those in ``torchvision.transforms``), it will +# still work with the V2 transforms without any change! +# +# We will illustrate this more completely below with a typical detection case, +# where our samples are just images, bounding boxes and labels: + +class MyCustomTransform(torch.nn.Module): + def forward(self, img, bboxes, label): # we assume inputs are always structured like this + print( + f"I'm transforming an image of shape {img.shape} " + f"with bboxes = {bboxes}\n{label = }" + ) + # Do some transformations. Here, we're just passing though the input + return img, bboxes, label + + +transforms = v2.Compose([ + MyCustomTransform(), + v2.RandomResizedCrop((224, 224), antialias=True), + v2.RandomHorizontalFlip(p=1), + v2.Normalize(mean=[0, 0, 0], std=[1, 1, 1]) +]) + +H, W = 256, 256 +img = torch.rand(3, H, W) +bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]), + format="XYXY", + canvas_size=(H, W) +) +label = 3 + +out_img, out_bboxes, out_label = transforms(img, bboxes, label) +# %% +print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }") +# %% +# .. note:: +# While working with TVTensor classes in your code, make sure to +# familiarize yourself with this section: +# :ref:`tv_tensor_unwrapping_behaviour` +# +# Supporting arbitrary input structures +# ===================================== +# +# In the section above, we have assumed that you already know the structure of +# your inputs and that you're OK with hard-coding this expected structure in +# your code. If you want your custom transforms to be as flexible as possible, +# this can be a bit limiting. +# +# A key feature of the builtin Torchvision V2 transforms is that they can accept +# arbitrary input structure and return the same structure as output (with +# transformed entries). For example, transforms can accept a single image, or a +# tuple of ``(img, label)``, or an arbitrary nested dictionary as input. Here's +# an example on the built-in transform :class:`~torchvision.transforms.v2.RandomHorizontalFlip`: + +structured_input = { + "img": img, + "annotations": (bboxes, label), + "something that will be ignored": (1, "hello"), + "another tensor that is ignored": torch.arange(10), +} +structured_output = v2.RandomHorizontalFlip(p=1)(structured_input) + +assert isinstance(structured_output, dict) +assert structured_output["something that will be ignored"] == (1, "hello") +assert (structured_output["another tensor that is ignored"] == torch.arange(10)).all() +print(f"The input bboxes are:\n{structured_input['annotations'][0]}") +print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}") + +# %% +# Basics: override the `transform()` method +# ----------------------------------------- +# +# In order to support arbitrary inputs in your custom transform, you will need +# to inherit from :class:`~torchvision.transforms.v2.Transform` and override the +# `.transform()` method (not the `forward()` method!). 
Below is a basic example: + + +class MyCustomTransform(v2.Transform): + def transform(self, inpt: Any, params: Dict[str, Any]): + if type(inpt) == torch.Tensor: + print(f"I'm transforming an image of shape {inpt.shape}") + return inpt + 1 # dummy transformation + elif isinstance(inpt, tv_tensors.BoundingBoxes): + print(f"I'm transforming bounding boxes! {inpt.canvas_size = }") + return tv_tensors.wrap(inpt + 100, like=inpt) # dummy transformation + + +my_custom_transform = MyCustomTransform() +structured_output = my_custom_transform(structured_input) + +assert isinstance(structured_output, dict) +assert structured_output["something that will be ignored"] == (1, "hello") +assert (structured_output["another tensor that is ignored"] == torch.arange(10)).all() +print(f"The input bboxes are:\n{structured_input['annotations'][0]}") +print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}") + +# %% +# An important thing to note is that when we call ``my_custom_transform`` on +# ``structured_input``, the input is flattened and then each individual part is +# passed to ``transform()``. That is, ``transform()``` receives the input image, +# then the bounding boxes, etc. Within ``transform()``, you can decide how to +# transform each input, based on their type. +# +# If you're curious why the other tensor (``torch.arange()``) didn't get passed +# to ``transform()``, see :ref:`this note ` for more +# details. +# +# Advanced: The ``make_params()`` method +# -------------------------------------- +# +# The ``make_params()`` method is called internally before calling +# ``transform()`` on each input. This is typically useful to generate random +# parameter values. In the example below, we use it to randomly apply the +# transformation with a probability of 0.5 + + +class MyRandomTransform(MyCustomTransform): + def __init__(self, p=0.5): + self.p = p + super().__init__() + + def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: + apply_transform = (torch.rand(size=(1,)) < self.p).item() + params = dict(apply_transform=apply_transform) + return params + + def transform(self, inpt: Any, params: Dict[str, Any]): + if not params["apply_transform"]: + print("Not transforming anything!") + return inpt + else: + return super().transform(inpt, params) + + +my_random_transform = MyRandomTransform() + +torch.manual_seed(0) +_ = my_random_transform(structured_input) # transforms +_ = my_random_transform(structured_input) # doesn't transform + +# %% +# +# .. note:: +# +# It's important for such random parameter generation to happen within +# ``make_params()`` and not within ``transform()``, so that for a given +# transform call, the same RNG applies to all the inputs in the same way. If +# we were to perform the RNG within ``transform()``, we would risk e.g. +# transforming the image while *not* transforming the bounding boxes. +# +# The ``make_params()`` method takes the list of all the inputs as parameter +# (each of the elements in this list will later be pased to ``transform()``). +# You can use ``flat_inputs`` to e.g. figure out the dimensions on the input, +# using :func:`~torchvision.transforms.v2.query_chw` or +# :func:`~torchvision.transforms.v2.query_size`. +# +# ``make_params()`` should return a dict (or actually, anything you want) that +# will then be passed to ``transform()``. 
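# %%
# To make this concrete, here is a minimal sketch (not a built-in transform;
# ``MyRandomCrop`` is a hypothetical name) of a fixed-size random crop that
# draws a single crop window in ``make_params()`` with the help of
# :func:`~torchvision.transforms.v2.query_size`, and applies that same window
# to every input in ``transform()``:

from torchvision.transforms.v2 import functional as F


class MyRandomCrop(v2.Transform):
    def __init__(self, size=100):
        super().__init__()
        self.size = size

    def make_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
        # query_size() returns the (H, W) shared by the transformable inputs.
        height, width = v2.query_size(flat_inputs)
        top = int(torch.randint(0, height - self.size + 1, size=(1,)))
        left = int(torch.randint(0, width - self.size + 1, size=(1,)))
        return dict(top=top, left=left)

    def transform(self, inpt: Any, params: Dict[str, Any]):
        # The same (top, left) window is applied to the image and to the
        # bounding boxes, because make_params() runs once per call.
        return F.crop(inpt, top=params["top"], left=params["left"],
                      height=self.size, width=self.size)


out = MyRandomCrop()(structured_input)
print(f"{out['annotations'][0].canvas_size = }")  # expected to be (100, 100)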
diff --git a/gallery/transforms/plot_custom_tv_tensors.py b/gallery/transforms/plot_custom_tv_tensors.py new file mode 100644 index 00000000000..9b113901461 --- /dev/null +++ b/gallery/transforms/plot_custom_tv_tensors.py @@ -0,0 +1,119 @@ +""" +==================================== +How to write your own TVTensor class +==================================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This guide is intended for advanced users and downstream library maintainers. We explain how to +write your own TVTensor class, and how to make it compatible with the built-in +Torchvision v2 transforms. Before continuing, make sure you have read +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +""" + +# %% +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + +# %% +# We will create a very simple class that just inherits from the base +# :class:`~torchvision.tv_tensors.TVTensor` class. It will be enough to cover +# what you need to know to implement your more elaborate uses-cases. If you need +# to create a class that carries meta-data, take a look at how the +# :class:`~torchvision.tv_tensors.BoundingBoxes` class is `implemented +# `_. + + +class MyTVTensor(tv_tensors.TVTensor): + pass + + +my_dp = MyTVTensor([1, 2, 3]) +my_dp + +# %% +# Now that we have defined our custom TVTensor class, we want it to be +# compatible with the built-in torchvision transforms, and the functional API. +# For that, we need to implement a kernel which performs the core of the +# transformation, and then "hook" it to the functional that we want to support +# via :func:`~torchvision.transforms.v2.functional.register_kernel`. +# +# We illustrate this process below: we create a kernel for the "horizontal flip" +# operation of our MyTVTensor class, and register it to the functional API. + +from torchvision.transforms.v2 import functional as F + + +@F.register_kernel(functional="hflip", tv_tensor_cls=MyTVTensor) +def hflip_my_tv_tensor(my_dp, *args, **kwargs): + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# To understand why :func:`~torchvision.tv_tensors.wrap` is used, see +# :ref:`tv_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now, +# we will explain it below in :ref:`param_forwarding`. +# +# .. note:: +# +# In our call to ``register_kernel`` above we used a string +# ``functional="hflip"`` to refer to the functional we want to hook into. We +# could also have used the functional *itself*, i.e. +# ``@register_kernel(functional=F.hflip, ...)``. +# +# Now that we have registered our kernel, we can call the functional API on a +# ``MyTVTensor`` instance: + +my_dp = MyTVTensor(torch.rand(3, 256, 256)) +_ = F.hflip(my_dp) + +# %% +# And we can also use the +# :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform, since it relies on :func:`~torchvision.transforms.v2.functional.hflip` internally: +t = v2.RandomHorizontalFlip(p=1) +_ = t(my_dp) + +# %% +# .. note:: +# +# We cannot register a kernel for a transform class, we can only register a +# kernel for a **functional**. The reason we can't register a transform +# class is because one transform may internally rely on more than one +# functional, so in general we can't register a single kernel for a given +# class. +# +# .. 
_param_forwarding: +# +# Parameter forwarding, and ensuring future compatibility of your kernels +# ----------------------------------------------------------------------- +# +# The functional API that you're hooking into is public and therefore +# **backward** compatible: we guarantee that the parameters of these functionals +# won't be removed or renamed without a proper deprecation cycle. However, we +# don't guarantee **forward** compatibility, and we may add new parameters in +# the future. +# +# Imagine that in a future version, Torchvision adds a new ``inplace`` parameter +# to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you +# already defined and registered your own kernel as + +def hflip_my_tv_tensor(my_dp): # noqa + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# then calling ``F.hflip(my_dp)`` will **fail**, because ``hflip`` will try to +# pass the new ``inplace`` parameter to your kernel, but your kernel doesn't +# accept it. +# +# For this reason, we recommend to always define your kernels with +# ``*args, **kwargs`` in their signature, as done above. This way, your kernel +# will be able to accept any new parameter that we may add in the future. +# (Technically, adding `**kwargs` only should be enough). diff --git a/gallery/transforms/plot_cutmix_mixup.py b/gallery/transforms/plot_cutmix_mixup.py new file mode 100644 index 00000000000..222be0ff359 --- /dev/null +++ b/gallery/transforms/plot_cutmix_mixup.py @@ -0,0 +1,150 @@ + +""" +=========================== +How to use CutMix and MixUp +=========================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +:class:`~torchvision.transforms.v2.CutMix` and +:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies +that can improve classification accuracy. + +These transforms are slightly different from the rest of the Torchvision +transforms, because they expect +**batches** of samples as input, not individual images. In this example we'll +explain how to use them: after the ``DataLoader``, or as part of a collation +function. +""" + +# %% +import torch +from torchvision.datasets import FakeData +from torchvision.transforms import v2 + + +NUM_CLASSES = 100 + +# %% +# Pre-processing pipeline +# ----------------------- +# +# We'll use a simple but typical image classification pipeline: + +preproc = v2.Compose([ + v2.PILToTensor(), + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), # to float32 in [0, 1] + v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), # typically from ImageNet +]) + +dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc) + +img, label = dataset[0] +print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }") + +# %% +# +# One important thing to note is that neither CutMix nor MixUp are part of this +# pre-processing pipeline. We'll add them a bit later once we define the +# DataLoader. 
Just as a refresher, this is what the DataLoader and training loop +# would look like if we weren't using CutMix or MixUp: + +from torch.utils.data import DataLoader + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + print(labels.dtype) + # + break +# %% + +# %% +# Where to use MixUp and CutMix +# ----------------------------- +# +# After the DataLoader +# ^^^^^^^^^^^^^^^^^^^^ +# +# Now let's add CutMix and MixUp. The simplest way to do this right after the +# DataLoader: the Dataloader has already batched the images and labels for us, +# and this is exactly what these transforms expect as input: + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +cutmix = v2.CutMix(num_classes=NUM_CLASSES) +mixup = v2.MixUp(num_classes=NUM_CLASSES) +cutmix_or_mixup = v2.RandomChoice([cutmix, mixup]) + +for images, labels in dataloader: + print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }") + images, labels = cutmix_or_mixup(images, labels) + print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }") + + # + break +# %% +# +# Note how the labels were also transformed: we went from a batched label of +# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The +# transformed labels can still be passed as-is to a loss function like +# :func:`torch.nn.functional.cross_entropy`. +# +# As part of the collation function +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Passing the transforms after the DataLoader is the simplest way to use CutMix +# and MixUp, but one disadvantage is that it does not take advantage of the +# DataLoader multi-processing. For that, we can pass those transforms as part of +# the collation function (refer to the `PyTorch docs +# `_ to learn +# more about collation). + +from torch.utils.data import default_collate + + +def collate_fn(batch): + return cutmix_or_mixup(*default_collate(batch)) + + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + # No need to call cutmix_or_mixup, it's already been called as part of the DataLoader! + # + break + +# %% +# Non-standard input format +# ------------------------- +# +# So far we've used a typical sample structure where we pass ``(images, +# labels)`` as inputs. MixUp and CutMix will magically work by default with most +# common sample structures: tuples where the second parameter is a tensor label, +# or dict with a "label[s]" key. Look at the documentation of the +# ``labels_getter`` parameter for more details. +# +# If your samples have a different structure, you can still use CutMix and MixUp +# by passing a callable to the ``labels_getter`` parameter. 
For example: + +batch = { + "imgs": torch.rand(4, 3, 224, 224), + "target": { + "classes": torch.randint(0, NUM_CLASSES, size=(4,)), + "some_other_key": "this is going to be passed-through" + } +} + + +def labels_getter(batch): + return batch["target"]["classes"] + + +out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch) +print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }") diff --git a/gallery/transforms/plot_rotated_box_transforms.py b/gallery/transforms/plot_rotated_box_transforms.py new file mode 100644 index 00000000000..7c6e3a559df --- /dev/null +++ b/gallery/transforms/plot_rotated_box_transforms.py @@ -0,0 +1,195 @@ +""" +=============================================================== +Transforms on Rotated Bounding Boxes +=============================================================== + +This example illustrates how to define and use rotated bounding boxes. + +.. note:: + Support for rotated bounding boxes was released in TorchVision 0.23 and is + currently a BETA feature. We don't expect the API to change, but there may + be some rare edge-cases. If you find any issues, please report them on + our bug tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue + +First, a bit of setup code: +""" + +# %% +from PIL import Image +from pathlib import Path +import matplotlib.pyplot as plt + + +import torch +from torchvision.tv_tensors import BoundingBoxes +from torchvision.transforms import v2 +from helpers import plot + +plt.rcParams["figure.figsize"] = [10, 5] +plt.rcParams["savefig.bbox"] = "tight" + +# if you change the seed, make sure that the randomly-applied transforms +# properly show that the image can be both transformed and *not* transformed! +torch.manual_seed(0) + +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +orig_img = Image.open(Path('../assets') / 'leaning_tower.jpg') + +# %% +# Creating a Rotated Bounding Box +# ------------------------------- +# Rotated bounding boxes are created by instantiating the +# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the ``format`` +# parameter of the constructor that determines if a bounding box is rotated or +# not. In this instance, we use the CXCYWHR +# :attr:`~torchvision.tv_tensors.BoundingBoxFormat`. The first two values are +# the X and Y coordinates of the center of the bounding box. The next two +# values are the width and height of the bounding box, and the last value is the +# rotation of the bounding box, in degrees. 
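# %%
# As a quick, self-contained sketch with made-up values (the box used
# throughout this example is created just below), a rotated box can also be
# converted to the corner-based ``XYXYXYXY`` format, which lists the four
# ``(x, y)`` corners of the box explicitly:

toy_rotated_box = BoundingBoxes(
    [[100.0, 100.0, 60.0, 20.0, 30.0]],  # cx, cy, w, h, angle in degrees
    format="CXCYWHR",
    canvas_size=(200, 200),
)
print(v2.ConvertBoundingBoxFormat("XYXYXYXY")(toy_rotated_box))

# %%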
+ + +orig_box = BoundingBoxes( + [ + [860.0, 1100, 570, 1840, -7], + ], + format="CXCYWHR", + canvas_size=(orig_img.size[1], orig_img.size[0]), +) + +plot([(orig_img, orig_box)], bbox_width=10) + +# %% +# Transforms illustrations +# ------------------------ +# +# Using :class:`~torchvision.transforms.RandomRotation`: +rotater = v2.RandomRotation(degrees=(0, 180), expand=True) +rotated_imgs = [rotater((orig_img, orig_box)) for _ in range(4)] +plot([(orig_img, orig_box)] + rotated_imgs, bbox_width=10) + +# %% +# Using :class:`~torchvision.transforms.Pad`: +padded_imgs_and_boxes = [ + v2.Pad(padding=padding)(orig_img, orig_box) + for padding in (30, 50, 100, 200) +] +plot([(orig_img, orig_box)] + padded_imgs_and_boxes, bbox_width=10) + +# %% +# Using :class:`~torchvision.transforms.Resize`: +resized_imgs = [ + v2.Resize(size=size)(orig_img, orig_box) + for size in (30, 50, 100, orig_img.size) +] +plot([(orig_img, orig_box)] + resized_imgs, bbox_width=5) + +# %% +# Note that the bounding box looking bigger in the images with less pixels is +# an artifact, not reality. That is merely the rasterised representation of the +# bounding box's boundaries appearing bigger because we specify a fixed width of +# that rasterized line. When the image is, say, only 30 pixels wide, a +# line that is 3 pixels wide is relatively large. +# +# .. _clamping_mode_tuto: +# +# Clamping Mode, and its effect on transforms +# ------------------------------------------- +# +# Some transforms, such as :class:`~torchvision.transforms.CenterCrop`, may +# result in having the transformed bounding box partially outside of the +# transformed (cropped) image. In general, this may happen on most of the +# :ref:`geometric transforms `. +# +# In such cases, the bounding box is clamped to the transformed image size based +# on its ``clamping_mode`` attribute. There are three values for +# ``clamping_mode``, which determines how the box is clamped after a +# transformation: +# +# - ``None``: No clamping is applied, and the bounding box may be partially +# outside of the image. +# - `"hard"`: The box is clamped to the image size, such that all its corners +# are within the image canvas. This potentially results in a loss of +# information, and it can lead to unintuitive resuts. But may be necessary +# for some applications e.g. if the model doesn't support boxes outside of +# their image. +# - `"soft"`: . This is an intermediate mode between ``None`` and "hard": the +# box is clamped, but not as strictly as in "hard" mode. Some box dimensions +# may still be outside of the image. This is the default when constucting +# :class:`~torchvision.tv_tensors.BoundingBoxes`. +# +# .. note:: +# +# For axis-aligned bounding boxes, the `"soft"` and `"hard"` modes behave +# the same, as the bounding box is always clamped to the image size. 
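# %%
# As a minimal, hedged sketch of clamping a box manually (using a made-up,
# axis-aligned box for simplicity, where "soft" and "hard" coincide as noted
# above; the next cells illustrate the clamping modes on the real image):

toy_box = BoundingBoxes(
    [[-20.0, 10.0, 50.0, 90.0]],  # sticks out of the canvas on the left
    format="XYXY",
    canvas_size=(100, 100),
    clamping_mode=None,  # keep it un-clamped at construction time
)
print(toy_box)
print(v2.ClampBoundingBoxes(clamping_mode="hard")(toy_box))  # x1 expected to become 0

# %%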
+# +# Let's illustrate the clamping modes with +# :class:`~torchvision.transforms.CenterCrop` transform: + +assert orig_box.clamping_mode == "soft" + +box_hard_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode="hard") + +box_no_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode=None) + +crop_sizes = (800, 1200, 2000, orig_img.size) +soft_center_crops_and_boxes = [ + v2.CenterCrop(size=size)(orig_img, orig_box) + for size in crop_sizes +] + +hard_center_crops_and_boxes = [ + v2.CenterCrop(size=size)(orig_img, box_hard_clamping) + for size in crop_sizes +] + +no_clamping_center_crops_and_boxes = [ + v2.CenterCrop(size=size)(orig_img, box_no_clamping) + for size in crop_sizes +] + +plot([[(orig_img, box_hard_clamping)] + hard_center_crops_and_boxes, + [(orig_img, orig_box)] + soft_center_crops_and_boxes, + [(orig_img, box_no_clamping)] + no_clamping_center_crops_and_boxes], + bbox_width=10) + +# %% +# The plot above shows the "hard" clamping mode, "soft" and ``None``, in this +# order. While "soft" and ``None`` result in similar plots, they do not lead to +# the exact same clamped boxes. The non-clamped boxes will show dimensions that are further away from the image: +print("boxes with soft clamping:") +print(soft_center_crops_and_boxes) +print() +print("boxes with no clamping:") +print(no_clamping_center_crops_and_boxes) + +# %% +# +# Setting the clamping mode +# -------------------------- +# +# The ``clamping_mode`` attribute, which determines the clamping strategy that +# is applied to a box, can be set in different ways: +# +# - When constructing the bounding box with its +# :class:`~torchvision.tv_tensors.BoundingBoxes` constructor, as done in the example above. +# - By directly setting the attribute on an existing instance, e.g. ``boxes.clamping_mode = "hard"``. +# - By calling the :class:`~torchvision.transforms.v2.SetClampingMode` transform. +# +# Also, remember that you can always clamp the bounding box manually by +# calling the :meth:`~torchvision.transforms.v2.ClampBoundingBoxes` transform! +# Here's an example illustrating all of these option: + +t = v2.Compose([ + v2.CenterCrop(size=(800,)), # clamps according to the current clamping_mode + # attribute, in this case set by the constructor + v2.SetClampingMode(None), # sets the clamping_mode attribute for future transforms + v2.Pad(padding=3), # clamps according to the current clamping_mode + # i.e. ``None`` + v2.ClampBoundingBoxes(clamping_mode="soft"), # clamps with "soft" mode. +]) + +out_img, out_box = t(orig_img, orig_box) +plot([(orig_img, orig_box), (out_img, out_box)], bbox_width=10) + +# %% diff --git a/gallery/transforms/plot_transforms_e2e.py b/gallery/transforms/plot_transforms_e2e.py new file mode 100644 index 00000000000..765d7ad51e5 --- /dev/null +++ b/gallery/transforms/plot_transforms_e2e.py @@ -0,0 +1,181 @@ +""" +=============================================================== +Transforms v2: End-to-end object detection/segmentation example +=============================================================== + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +Object detection and segmentation tasks are natively supported: +``torchvision.transforms.v2`` enables jointly transforming images, videos, +bounding boxes, and masks. 
+ +This example showcases an end-to-end instance segmentation training case using +Torchvision utils from ``torchvision.datasets``, ``torchvision.models`` and +``torchvision.transforms.v2``. Everything covered here can be applied similarly +to object detection or semantic segmentation tasks. +""" + +# %% +import pathlib + +import torch +import torch.utils.data + +from torchvision import models, datasets, tv_tensors +from torchvision.transforms import v2 + +torch.manual_seed(0) + +# This loads fake data for illustration purposes of this example. In practice, you'll have +# to replace this with the proper data. +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +ROOT = pathlib.Path("../assets") / "coco" +IMAGES_PATH = str(ROOT / "images") +ANNOTATIONS_PATH = str(ROOT / "instances.json") +from helpers import plot + + +# %% +# Dataset preparation +# ------------------- +# +# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently +# returns. + +dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH) + +sample = dataset[0] +img, target = sample +print(f"{type(img) = }\n{type(target) = }\n{type(target[0]) = }\n{target[0].keys() = }") + + +# %% +# Torchvision datasets preserve the data structure and types as it was intended +# by the datasets authors. So by default, the output structure may not always be +# compatible with the models or the transforms. +# +# To overcome that, we can use the +# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For +# :class:`~torchvision.datasets.CocoDetection`, this changes the target +# structure to a single dictionary of lists: + +dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks")) + +sample = dataset[0] +img, target = sample +print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }") +print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }") + +# %% +# We used the ``target_keys`` parameter to specify the kind of output we're +# interested in. Our dataset now returns a target which is dict where the values +# are :ref:`TVTensors ` (all are :class:`torch.Tensor` +# subclasses). We're dropped all unncessary keys from the previous output, but +# if you need any of the original keys e.g. "image_id", you can still ask for +# it. +# +# .. note:: +# +# If you just want to do detection, you don't need and shouldn't pass +# "masks" in ``target_keys``: if masks are present in the sample, they will +# be transformed, slowing down your transformations unnecessarily. +# +# As baseline, let's have a look at a sample without transformations: + +plot([dataset[0], dataset[1]]) + + +# %% +# Transforms +# ---------- +# +# Let's now define our pre-processing transforms. All the transforms know how +# to handle images, bounding boxes and masks when relevant. +# +# Transforms are typically passed as the ``transforms`` parameter of the +# dataset so that they can leverage multi-processing from the +# :class:`torch.utils.data.DataLoader`. 
+ +transforms = v2.Compose( + [ + v2.ToImage(), + v2.RandomPhotometricDistort(p=1), + v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}), + v2.RandomIoUCrop(), + v2.RandomHorizontalFlip(p=1), + v2.SanitizeBoundingBoxes(), + v2.ToDtype(torch.float32, scale=True), + ] +) + +dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH, transforms=transforms) +dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=["boxes", "labels", "masks"]) + +# %% +# A few things are worth noting here: +# +# - We're converting the PIL image into a +# :class:`~torchvision.transforms.v2.Image` object. This isn't strictly +# necessary, but relying on Tensors (here: a Tensor subclass) will +# :ref:`generally be faster `. +# - We are calling :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` to +# make sure we remove degenerate bounding boxes, as well as their +# corresponding labels and masks. +# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` should be placed +# at least once at the end of a detection pipeline; it is particularly +# critical if :class:`~torchvision.transforms.v2.RandomIoUCrop` was used. +# +# Let's look how the sample looks like with our augmentation pipeline in place: + +# sphinx_gallery_thumbnail_number = 2 +plot([dataset[0], dataset[1]]) + + +# %% +# We can see that the color of the images were distorted, zoomed in or out, and flipped. +# The bounding boxes and the masks were transformed accordingly. And without any further ado, we can start training. +# +# Data loading and training loop +# ------------------------------ +# +# Below we're using Mask-RCNN which is an instance segmentation model, but +# everything we've covered in this tutorial also applies to object detection and +# semantic segmentation tasks. + +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=2, + # We need a custom collation function here, since the object detection + # models expect a sequence of images and target dictionaries. The default + # collation function tries to torch.stack() the individual elements, + # which fails in general for object detection, because the number of bounding + # boxes varies between the images of the same batch. + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train() + +for imgs, targets in data_loader: + loss_dict = model(imgs, targets) + # Put your training logic here + + print(f"{[img.shape for img in imgs] = }") + print(f"{[type(target) for target in targets] = }") + for name, loss_val in loss_dict.items(): + print(f"{name:<20}{loss_val:.3f}") + +# %% +# Training References +# ------------------- +# +# From there, you can check out the `torchvision references +# `_ where you'll find +# the actual training scripts we use to train our models. +# +# **Disclaimer** The code in our references is more complex than what you'll +# need for your own use-cases: this is because we're supporting different +# backends (PIL, tensors, TVTensors) and different transforms namespaces (v1 and +# v2). So don't be afraid to simplify and only keep what you need. diff --git a/gallery/transforms/plot_transforms_getting_started.py b/gallery/transforms/plot_transforms_getting_started.py new file mode 100644 index 00000000000..d7fb36a4a77 --- /dev/null +++ b/gallery/transforms/plot_transforms_getting_started.py @@ -0,0 +1,268 @@ +""" +================================== +Getting started with transforms v2 +================================== + +.. 
note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates all of what you need to know to get started with the +new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like +image classification, and more advanced ones like object detection / +segmentation. +""" + +# %% +# First, a bit of setup +from pathlib import Path +import torch +import matplotlib.pyplot as plt +plt.rcParams["savefig.bbox"] = 'tight' + +from torchvision.transforms import v2 +from torchvision.io import decode_image + +torch.manual_seed(1) + +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +img = decode_image(str(Path('../assets') / 'astronaut.jpg')) +print(f"{type(img) = }, {img.dtype = }, {img.shape = }") + +# %% +# The basics +# ---------- +# +# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in +# fact, most of them are): instantiate a transform, pass an input, get a +# transformed output: + +transform = v2.RandomCrop(size=(224, 224)) +out = transform(img) + +plot([img, out]) + +# %% +# I just want to do image classification +# -------------------------------------- +# +# If you just care about image classification, things are very simple. A basic +# classification pipeline may look like this: + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +]) +out = transforms(img) + +plot([img, out]) + +# %% +# Such transformation pipeline is typically passed as the ``transform`` argument +# to the :ref:`Datasets `, e.g. ``ImageNet(..., +# transform=transforms)``. +# +# That's pretty much all there is. From there, read through our :ref:`main docs +# ` to learn more about recommended practices and conventions, or +# explore more :ref:`examples ` e.g. how to use augmentation +# transforms like :ref:`CutMix and MixUp +# `. +# +# .. note:: +# +# If you're already relying on the ``torchvision.transforms`` v1 API, +# we recommend to :ref:`switch to the new v2 transforms`. It's +# very easy: the v2 transforms are fully compatible with the v1 API, so you +# only need to change the import! +# +# Videos, boxes, masks, keypoints +# ------------------------------- +# +# The Torchvision transforms in the ``torchvision.transforms.v2`` namespace +# support tasks beyond image classification: they can also transform rotated or +# axis-aligned bounding boxes, segmentation / detection masks, videos, and +# keypoints. +# +# Let's briefly look at a detection example with bounding boxes. + +from torchvision import tv_tensors # we'll describe this a bit later, bare with us + +boxes = tv_tensors.BoundingBoxes( + [ + [15, 10, 370, 510], + [275, 340, 510, 510], + [130, 345, 210, 425] + ], + format="XYXY", canvas_size=img.shape[-2:]) + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomPhotometricDistort(p=1), + v2.RandomHorizontalFlip(p=1), +]) +out_img, out_boxes = transforms(img, boxes) +print(type(boxes), type(out_boxes)) + +plot([(img, boxes), (out_img, out_boxes)]) + +# %% +# +# The example above focuses on object detection. 
But if we had masks +# (:class:`torchvision.tv_tensors.Mask`) for object segmentation or semantic +# segmentation, or videos (:class:`torchvision.tv_tensors.Video`), we could have +# passed them to the transforms in exactly the same way. +# +# By now you likely have a few questions: what are these TVTensors, how do we +# use them, and what is the expected input/output of those transforms? We'll +# answer these in the next sections. + +# %% +# +# .. _what_are_tv_tensors: +# +# What are TVTensors? +# -------------------- +# +# TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are +# :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.BoundingBoxes`, +# :class:`~torchvision.tv_tensors.Mask`, +# :class:`~torchvision.tv_tensors.Video`, and +# :class:`~torchvision.tv_tensors.KeyPoints`. +# +# TVTensors look and feel just like regular tensors - they **are** tensors. +# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` +# or any ``torch.*`` operator will also work on a TVTensor: + +img_dp = tv_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)) + +print(f"{isinstance(img_dp, torch.Tensor) = }") +print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }") + +# %% +# These TVTensor classes are at the core of the transforms: in order to +# transform a given input, the transforms first look at the **class** of the +# object, and dispatch to the appropriate implementation accordingly. +# +# You don't need to know much more about TVTensors at this point, but advanced +# users who want to learn more can refer to +# :ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +# +# What do I pass as input? +# ------------------------ +# +# Above, we've seen two examples: one where we passed a single image as input +# i.e. ``out = transforms(img)``, and one where we passed both an image and +# bounding boxes, i.e. ``out_img, out_boxes = transforms(img, boxes)``. +# +# In fact, transforms support **arbitrary input structures**. The input can be a +# single image, a tuple, an arbitrarily nested dictionary... pretty much +# anything. The same structure will be returned as output. Below, we use the +# same detection transforms, but pass a tuple (image, target_dict) as input and +# we're getting the same structure as output: + +target = { + "boxes": boxes, + "labels": torch.arange(boxes.shape[0]), + "this_is_ignored": ("arbitrary", {"structure": "!"}) +} + +# Re-using the transforms and definitions from above. +out_img, out_target = transforms(img, target) + +# sphinx_gallery_thumbnail_number = 4 +plot([(img, target["boxes"]), (out_img, out_target["boxes"])]) +print(f"{out_target['this_is_ignored']}") + +# %% +# We passed a tuple so we get a tuple back, and the second element is the +# tranformed target dict. Transforms don't really care about the structure of +# the input; as mentioned above, they only care about the **type** of the +# objects and transforms them accordingly. +# +# *Foreign* objects like strings or ints are simply passed-through. This can be +# useful e.g. if you want to associate a path with every single sample when +# debugging! +# +# .. _passthrough_heuristic: +# +# .. note:: +# +# **Disclaimer** This note is slightly advanced and can be safely skipped on +# a first read. +# +# Pure :class:`torch.Tensor` objects are, in general, treated as images (or +# as videos for video-specific transforms). 
Indeed, you may have noticed +# that in the code above we haven't used the +# :class:`~torchvision.tv_tensors.Image` class at all, and yet our images +# got transformed properly. Transforms follow the following logic to +# determine whether a pure Tensor should be treated as an image (or video), +# or just ignored: +# +# * If there is an :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.Video`, +# or :class:`PIL.Image.Image` instance in the input, all other pure +# tensors are passed-through. +# * If there is no :class:`~torchvision.tv_tensors.Image` or +# :class:`~torchvision.tv_tensors.Video` instance, only the first pure +# :class:`torch.Tensor` will be transformed as image or video, while all +# others will be passed-through. Here "first" means "first in a depth-wise +# traversal". +# +# This is what happened in the detection example above: the first pure +# tensor was the image so it got transformed properly, and all other pure +# tensor instances like the ``labels`` were passed-through (although labels +# can still be transformed by some transforms like +# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!). +# +# .. _transforms_datasets_intercompatibility: +# +# Transforms and Datasets intercompatibility +# ------------------------------------------ +# +# Roughly speaking, the output of the datasets must correspond to the input of +# the transforms. How to do that depends on whether you're using the torchvision +# :ref:`built-in datatsets `, or your own custom datasets. +# +# Using built-in datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you're just doing image classification, you don't need to do anything. Just +# use ``transform`` argument of the dataset e.g. ``ImageNet(..., +# transform=transforms)`` and you're good to go. +# +# Torchvision also supports datasets for object detection or segmentation like +# :class:`torchvision.datasets.CocoDetection`. Those datasets predate +# the existence of the :mod:`torchvision.transforms.v2` module and of the +# TVTensors, so they don't return TVTensors out of the box. +# +# An easy way to force those datasets to return TVTensors and to make them +# compatible with v2 transforms is to use the +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function: +# +# .. code-block:: python +# +# from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2 +# +# dataset = CocoDetection(..., transforms=my_transforms) +# dataset = wrap_dataset_for_transforms_v2(dataset) +# # Now the dataset returns TVTensors! +# +# Using your own datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you have a custom dataset, then you'll need to convert your objects into +# the appropriate TVTensor classes. Creating TVTensor instances is very easy, +# refer to :ref:`tv_tensor_creation` for more details. +# +# There are two main places where you can implement that conversion logic: +# +# - At the end of the datasets's ``__getitem__`` method, before returning the +# sample (or by sub-classing the dataset). +# - As the very first step of your transforms pipeline +# +# Either way, the logic will depend on your specific dataset. 
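# %%
# As a hedged sketch (the dataset class and the ``_load_raw_sample`` helper
# are hypothetical), the first option could look like this: the raw
# annotations are wrapped into TVTensors at the end of ``__getitem__``, right
# before the v2 transforms are applied:
#
# .. code-block:: python
#
#     from torchvision import tv_tensors
#
#     class MyDetectionDataset(torch.utils.data.Dataset):
#         def __init__(self, transforms=None):
#             self.transforms = transforms
#
#         def __getitem__(self, idx):
#             # img is assumed to be a PIL image here
#             img, raw_boxes, raw_labels = self._load_raw_sample(idx)
#             target = {
#                 "boxes": tv_tensors.BoundingBoxes(
#                     raw_boxes, format="XYXY",
#                     canvas_size=(img.height, img.width),
#                 ),
#                 "labels": torch.as_tensor(raw_labels),
#             }
#             if self.transforms is not None:
#                 img, target = self.transforms(img, target)
#             return img, target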
diff --git a/gallery/plot_transforms.py b/gallery/transforms/plot_transforms_illustrations.py similarity index 54% rename from gallery/plot_transforms.py rename to gallery/transforms/plot_transforms_illustrations.py index c6e44a14e22..0c1f3b40021 100644 --- a/gallery/plot_transforms.py +++ b/gallery/transforms/plot_transforms_illustrations.py @@ -3,317 +3,329 @@ Illustration of transforms ========================== -This example illustrates the various transforms available in :ref:`the -torchvision.transforms module `. +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates some of the various transforms available in :ref:`the +torchvision.transforms.v2 module `. """ +# %% # sphinx_gallery_thumbnail_path = "../../gallery/assets/transforms_thumbnail.png" from PIL import Image from pathlib import Path import matplotlib.pyplot as plt -import numpy as np import torch -import torchvision.transforms as T - +from torchvision.transforms import v2 plt.rcParams["savefig.bbox"] = 'tight' -orig_img = Image.open(Path('assets') / 'astronaut.jpg') + # if you change the seed, make sure that the randomly-applied transforms # properly show that the image can be both transformed and *not* transformed! torch.manual_seed(0) - -def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs): - if not isinstance(imgs[0], list): - # Make a 2d grid even if there's just 1 row - imgs = [imgs] - - num_rows = len(imgs) - num_cols = len(imgs[0]) + with_orig - fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) - for row_idx, row in enumerate(imgs): - row = [orig_img] + row if with_orig else row - for col_idx, img in enumerate(row): - ax = axs[row_idx, col_idx] - ax.imshow(np.asarray(img), **imshow_kwargs) - ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - if with_orig: - axs[0, 0].set(title='Original image') - axs[0, 0].title.set_size(8) - if row_title is not None: - for row_idx in range(num_rows): - axs[row_idx, 0].set(ylabel=row_title[row_idx]) - - plt.tight_layout() - - -#################################### +# If you're trying to run that on Colab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +orig_img = Image.open(Path('../assets') / 'astronaut.jpg') + +# %% +# Geometric Transforms +# -------------------- +# Geometric image transformation refers to the process of altering the geometric properties of an image, +# such as its shape, size, orientation, or position. +# It involves applying mathematical operations to the image pixels or coordinates to achieve the desired transformation. +# # Pad -# --- +# ~~~ # The :class:`~torchvision.transforms.Pad` transform # (see also :func:`~torchvision.transforms.functional.pad`) -# fills image borders with some pixel values. -padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] -plot(padded_imgs) +# pads all image borders with some pixel values. +padded_imgs = [v2.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] +plot([orig_img] + padded_imgs) -#################################### +# %% # Resize -# ------ +# ~~~~~~ # The :class:`~torchvision.transforms.Resize` transform # (see also :func:`~torchvision.transforms.functional.resize`) # resizes an image. 
-resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(resized_imgs) +resized_imgs = [v2.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + resized_imgs) -#################################### +# %% # CenterCrop -# ---------- +# ~~~~~~~~~~ # The :class:`~torchvision.transforms.CenterCrop` transform # (see also :func:`~torchvision.transforms.functional.center_crop`) # crops the given image at the center. -center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(center_crops) +center_crops = [v2.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + center_crops) -#################################### +# %% # FiveCrop -# -------- +# ~~~~~~~~ # The :class:`~torchvision.transforms.FiveCrop` transform # (see also :func:`~torchvision.transforms.functional.five_crop`) # crops the given image into four corners and the central crop. -(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img) -plot([top_left, top_right, bottom_left, bottom_right, center]) - -#################################### -# Grayscale -# --------- -# The :class:`~torchvision.transforms.Grayscale` transform -# (see also :func:`~torchvision.transforms.functional.to_grayscale`) -# converts an image to grayscale -gray_img = T.Grayscale()(orig_img) -plot([gray_img], cmap='gray') - -#################################### -# Random transforms -# ----------------- -# The following transforms are random, which means that the same transfomer -# instance will produce different result each time it transforms a given image. -# -# ColorJitter -# ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.ColorJitter` transform -# randomly changes the brightness, saturation, and other properties of an image. -jitter = T.ColorJitter(brightness=.5, hue=.3) -jitted_imgs = [jitter(orig_img) for _ in range(4)] -plot(jitted_imgs) - -#################################### -# GaussianBlur -# ~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.GaussianBlur` transform -# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) -# performs gaussian blur transform on an image. -blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)) -blurred_imgs = [blurrer(orig_img) for _ in range(4)] -plot(blurred_imgs) +(top_left, top_right, bottom_left, bottom_right, center) = v2.FiveCrop(size=(100, 100))(orig_img) +plot([orig_img] + [top_left, top_right, bottom_left, bottom_right, center]) -#################################### +# %% # RandomPerspective # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPerspective` transform # (see also :func:`~torchvision.transforms.functional.perspective`) # performs random perspective transform on an image. -perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0) +perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0) perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)] -plot(perspective_imgs) +plot([orig_img] + perspective_imgs) -#################################### +# %% # RandomRotation # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomRotation` transform # (see also :func:`~torchvision.transforms.functional.rotate`) # rotates an image with random angle. 
-rotater = T.RandomRotation(degrees=(0, 180)) +rotater = v2.RandomRotation(degrees=(0, 180)) rotated_imgs = [rotater(orig_img) for _ in range(4)] -plot(rotated_imgs) +plot([orig_img] + rotated_imgs) -#################################### +# %% # RandomAffine # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAffine` transform # (see also :func:`~torchvision.transforms.functional.affine`) # performs random affine transform on an image. -affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) +affine_transfomer = v2.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) affine_imgs = [affine_transfomer(orig_img) for _ in range(4)] -plot(affine_imgs) +plot([orig_img] + affine_imgs) -#################################### +# %% # ElasticTransform # ~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.ElasticTransform` transform # (see also :func:`~torchvision.transforms.functional.elastic_transform`) # Randomly transforms the morphology of objects in images and produces a # see-through-water-like effect. -elastic_transformer = T.ElasticTransform(alpha=250.0) +elastic_transformer = v2.ElasticTransform(alpha=250.0) transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomCrop # ~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomCrop` transform # (see also :func:`~torchvision.transforms.functional.crop`) # crops an image at a random location. -cropper = T.RandomCrop(size=(128, 128)) +cropper = v2.RandomCrop(size=(128, 128)) crops = [cropper(orig_img) for _ in range(4)] -plot(crops) +plot([orig_img] + crops) -#################################### +# %% # RandomResizedCrop # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomResizedCrop` transform # (see also :func:`~torchvision.transforms.functional.resized_crop`) # crops an image at a random location, and then resizes the crop to a given # size. -resize_cropper = T.RandomResizedCrop(size=(32, 32)) +resize_cropper = v2.RandomResizedCrop(size=(32, 32)) resized_crops = [resize_cropper(orig_img) for _ in range(4)] -plot(resized_crops) +plot([orig_img] + resized_crops) + +# %% +# Photometric Transforms +# ---------------------- +# Photometric image transformation refers to the process of modifying the photometric properties of an image, +# such as its brightness, contrast, color, or tone. +# These transformations are applied to change the visual appearance of an image +# while preserving its geometric structure. +# +# Except :class:`~torchvision.transforms.Grayscale`, the following transforms are random, +# which means that the same transform +# instance will produce different result each time it transforms a given image. +# +# Grayscale +# ~~~~~~~~~ +# The :class:`~torchvision.transforms.Grayscale` transform +# (see also :func:`~torchvision.transforms.functional.to_grayscale`) +# converts an image to grayscale +gray_img = v2.Grayscale()(orig_img) +plot([orig_img, gray_img], cmap='gray') -#################################### +# %% +# ColorJitter +# ~~~~~~~~~~~ +# The :class:`~torchvision.transforms.ColorJitter` transform +# randomly changes the brightness, contrast, saturation, hue, and other properties of an image. 
+jitter = v2.ColorJitter(brightness=.5, hue=.3) +jittered_imgs = [jitter(orig_img) for _ in range(4)] +plot([orig_img] + jittered_imgs) + +# %% +# GaussianBlur +# ~~~~~~~~~~~~ +# The :class:`~torchvision.transforms.GaussianBlur` transform +# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) +# performs gaussian blur transform on an image. +blurrer = v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.)) +blurred_imgs = [blurrer(orig_img) for _ in range(4)] +plot([orig_img] + blurred_imgs) + +# %% # RandomInvert # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomInvert` transform # (see also :func:`~torchvision.transforms.functional.invert`) # randomly inverts the colors of the given image. -inverter = T.RandomInvert() +inverter = v2.RandomInvert() invertered_imgs = [inverter(orig_img) for _ in range(4)] -plot(invertered_imgs) +plot([orig_img] + invertered_imgs) -#################################### +# %% # RandomPosterize # ~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPosterize` transform # (see also :func:`~torchvision.transforms.functional.posterize`) # randomly posterizes the image by reducing the number of bits # of each color channel. -posterizer = T.RandomPosterize(bits=2) +posterizer = v2.RandomPosterize(bits=2) posterized_imgs = [posterizer(orig_img) for _ in range(4)] -plot(posterized_imgs) +plot([orig_img] + posterized_imgs) -#################################### +# %% # RandomSolarize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomSolarize` transform # (see also :func:`~torchvision.transforms.functional.solarize`) # randomly solarizes the image by inverting all pixel values above # the threshold. -solarizer = T.RandomSolarize(threshold=192.0) +solarizer = v2.RandomSolarize(threshold=192.0) solarized_imgs = [solarizer(orig_img) for _ in range(4)] -plot(solarized_imgs) +plot([orig_img] + solarized_imgs) -#################################### +# %% # RandomAdjustSharpness # ~~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAdjustSharpness` transform # (see also :func:`~torchvision.transforms.functional.adjust_sharpness`) # randomly adjusts the sharpness of the given image. -sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2) +sharpness_adjuster = v2.RandomAdjustSharpness(sharpness_factor=2) sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)] -plot(sharpened_imgs) +plot([orig_img] + sharpened_imgs) -#################################### +# %% # RandomAutocontrast # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAutocontrast` transform # (see also :func:`~torchvision.transforms.functional.autocontrast`) # randomly applies autocontrast to the given image. -autocontraster = T.RandomAutocontrast() +autocontraster = v2.RandomAutocontrast() autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)] -plot(autocontrasted_imgs) +plot([orig_img] + autocontrasted_imgs) -#################################### +# %% # RandomEqualize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomEqualize` transform # (see also :func:`~torchvision.transforms.functional.equalize`) # randomly equalizes the histogram of the given image. 
-equalizer = T.RandomEqualize() +equalizer = v2.RandomEqualize() equalized_imgs = [equalizer(orig_img) for _ in range(4)] -plot(equalized_imgs) +plot([orig_img] + equalized_imgs) -#################################### +# %% +# JPEG +# ~~~~~~~~~~~~~~ +# The :class:`~torchvision.transforms.v2.JPEG` transform +# (see also :func:`~torchvision.transforms.v2.functional.jpeg`) +# applies JPEG compression to the given image with random +# degree of compression. +jpeg = v2.JPEG((5, 50)) +jpeg_imgs = [jpeg(orig_img) for _ in range(4)] +plot([orig_img] + jpeg_imgs) + +# %% +# Augmentation Transforms +# ----------------------- +# The following transforms are combinations of multiple transforms, +# either geometric or photometric, or both. +# # AutoAugment # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.AutoAugment` transform # automatically augments data based on a given auto-augmentation policy. # See :class:`~torchvision.transforms.AutoAugmentPolicy` for the available policies. -policies = [T.AutoAugmentPolicy.CIFAR10, T.AutoAugmentPolicy.IMAGENET, T.AutoAugmentPolicy.SVHN] -augmenters = [T.AutoAugment(policy) for policy in policies] +policies = [v2.AutoAugmentPolicy.CIFAR10, v2.AutoAugmentPolicy.IMAGENET, v2.AutoAugmentPolicy.SVHN] +augmenters = [v2.AutoAugment(policy) for policy in policies] imgs = [ [augmenter(orig_img) for _ in range(4)] for augmenter in augmenters ] row_title = [str(policy).split('.')[-1] for policy in policies] -plot(imgs, row_title=row_title) +plot([[orig_img] + row for row in imgs], row_title=row_title) -#################################### +# %% # RandAugment # ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.RandAugment` transform automatically augments the data. -augmenter = T.RandAugment() +# The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment. +augmenter = v2.RandAugment() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # TrivialAugmentWide # ~~~~~~~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.TrivialAugmentWide` transform automatically augments the data. -augmenter = T.TrivialAugmentWide() +# The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment. +# However, instead of transforming an image multiple times, it transforms an image only once +# using a random transform from a given list with a random strength number. +augmenter = v2.TrivialAugmentWide() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # AugMix # ~~~~~~ -# The :class:`~torchvision.transforms.AugMix` transform automatically augments the data. -augmenter = T.AugMix() +# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image. +augmenter = v2.AugMix() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### -# Randomly-applied transforms +# %% +# Randomly-applied Transforms # --------------------------- # -# Some transforms are randomly-applied given a probability ``p``. That is, the -# transformed image may actually be the same as the original one, even when -# called with the same transformer instance! +# The following transforms are randomly-applied given a probability ``p``. 
That is, given ``p = 0.5``, +# there is a 50% chance to return the original image, and a 50% chance to return the transformed image, +# even when called with the same transform instance! # # RandomHorizontalFlip # ~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomHorizontalFlip` transform # (see also :func:`~torchvision.transforms.functional.hflip`) # performs horizontal flip of an image, with a given probability. -hflipper = T.RandomHorizontalFlip(p=0.5) +hflipper = v2.RandomHorizontalFlip(p=0.5) transformed_imgs = [hflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomVerticalFlip # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomVerticalFlip` transform # (see also :func:`~torchvision.transforms.functional.vflip`) # performs vertical flip of an image, with a given probability. -vflipper = T.RandomVerticalFlip(p=0.5) +vflipper = v2.RandomVerticalFlip(p=0.5) transformed_imgs = [vflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomApply # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomApply` transform # randomly applies a list of transforms, with a given probability. -applier = T.RandomApply(transforms=[T.RandomCrop(size=(64, 64))], p=0.5) +applier = v2.RandomApply(transforms=[v2.RandomCrop(size=(64, 64))], p=0.5) transformed_imgs = [applier(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) diff --git a/gallery/transforms/plot_tv_tensors.py b/gallery/transforms/plot_tv_tensors.py new file mode 100644 index 00000000000..2c6ebbf9031 --- /dev/null +++ b/gallery/transforms/plot_tv_tensors.py @@ -0,0 +1,233 @@ +""" +============= +TVTensors FAQ +============= + +.. note:: + Try on `Colab `_ + or :ref:`go to the end ` to download the full example code. + + +TVTensors are Tensor subclasses introduced together with +``torchvision.transforms.v2``. This example showcases what these TVTensors are +and how they behave. + +.. warning:: + + **Intended Audience** Unless you're writing your own transforms or your own TVTensors, you + probably do not need to read this guide. This is a fairly low-level topic + that most users will not need to worry about: you do not need to understand + the internals of TVTensors to efficiently rely on + ``torchvision.transforms.v2``. It may however be useful for advanced users + trying to implement their own datasets, transforms, or work directly with + the TVTensors. +""" + +# %% +import PIL.Image + +import torch +from torchvision import tv_tensors + + +# %% +# What are TVTensors? +# ------------------- +# +# TVTensors are zero-copy tensor subclasses: + +tensor = torch.rand(3, 256, 256) +image = tv_tensors.Image(tensor) + +assert isinstance(image, torch.Tensor) +assert image.data_ptr() == tensor.data_ptr() + +# %% +# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function +# for the input data. +# +# :mod:`torchvision.tv_tensors` supports five types of TVTensors: +# +# * :class:`~torchvision.tv_tensors.Image` +# * :class:`~torchvision.tv_tensors.Video` +# * :class:`~torchvision.tv_tensors.BoundingBoxes` +# * :class:`~torchvision.tv_tensors.KeyPoints` +# * :class:`~torchvision.tv_tensors.Mask` +# +# What can I do with a TVTensor? +# ------------------------------ +# +# TVTensors look and feel just like regular tensors - they **are** tensors. 
+# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or +# any ``torch.*`` operator will also work on TVTensors. See +# :ref:`tv_tensor_unwrapping_behaviour` for a few gotchas. + +# %% +# .. _tv_tensor_creation: +# +# How do I construct a TVTensor? +# ------------------------------ +# +# Using the constructor +# ^^^^^^^^^^^^^^^^^^^^^ +# +# Each TVTensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` + +image = tv_tensors.Image([[[[0, 1], [1, 0]]]]) +print(image) + + +# %% +# Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` +# parameters. + +float_image = tv_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +# %% +# In addition, :class:`~torchvision.tv_tensors.Image` and :class:`~torchvision.tv_tensors.Mask` can also take a +# :class:`PIL.Image.Image` directly: + +image = tv_tensors.Image(PIL.Image.open("../assets/astronaut.jpg")) +print(image.shape, image.dtype) + +# %% +# Some TVTensors require additional metadata to be passed in ordered to be constructed. For example, +# :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the +# corresponding image (``canvas_size``) alongside the actual values. These +# metadata are required to properly transform the bounding boxes. +# In a similar fashion, :class:`~torchvision.tv_tensors.KeyPoints` also require the ``canvas_size`` metadata to be added. + +bboxes = tv_tensors.BoundingBoxes( + [[17, 16, 344, 495], [0, 10, 0, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=image.shape[-2:] +) +print(bboxes) + + +keypoints = tv_tensors.KeyPoints( + [[17, 16], [344, 495], [0, 10], [0, 10]], + canvas_size=image.shape[-2:] +) +print(keypoints) + +# %% +# Using ``tv_tensors.wrap()`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can also use the :func:`~torchvision.tv_tensors.wrap` function to wrap a tensor object +# into a TVTensor. This is useful when you already have an object of the +# desired type, which typically happens when writing transforms: you just want +# to wrap the output like the input. + +new_bboxes = torch.tensor([0, 20, 30, 40]) +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) +assert new_bboxes.canvas_size == bboxes.canvas_size + +# %% +# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass +# it as a parameter to override it. +# +# .. _tv_tensor_unwrapping_behaviour: +# +# I had a TVTensor but now I have a Tensor. Help! +# ----------------------------------------------- +# +# By default, operations on :class:`~torchvision.tv_tensors.TVTensor` objects +# will return a pure Tensor: + + +assert isinstance(bboxes, tv_tensors.BoundingBoxes) + +# Shift bboxes by 3 pixels in both H and W +new_bboxes = bboxes + 3 + +assert isinstance(new_bboxes, torch.Tensor) +assert not isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# .. note:: +# +# This behavior only affects native ``torch`` operations. If you are using +# the built-in ``torchvision`` transforms or functionals, you will always get +# as output the same type that you passed as input (pure ``Tensor`` or +# ``TVTensor``). + +# %% +# But I want a TVTensor back! 
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can re-wrap a pure tensor into a TVTensor by just calling the TVTensor +# constructor, or by using the :func:`~torchvision.tv_tensors.wrap` function +# (see more details above in :ref:`tv_tensor_creation`): + +new_bboxes = bboxes + 3 +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Alternatively, you can use the :func:`~torchvision.tv_tensors.set_return_type` +# as a global config setting for the whole program, or as a context manager +# (read its docs to learn more about caveats): + +with tv_tensors.set_return_type("TVTensor"): + new_bboxes = bboxes + 3 +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Why is this happening? +# ^^^^^^^^^^^^^^^^^^^^^^ +# +# **For performance reasons**. :class:`~torchvision.tv_tensors.TVTensor` +# classes are Tensor subclasses, so any operation involving a +# :class:`~torchvision.tv_tensors.TVTensor` object will go through the +# `__torch_function__ +# `_ +# protocol. This induces a small overhead, which we want to avoid when possible. +# This doesn't matter for built-in ``torchvision`` transforms because we can +# avoid the overhead there, but it could be a problem in your model's +# ``forward``. +# +# **The alternative isn't much better anyway.** For every operation where +# preserving the :class:`~torchvision.tv_tensors.TVTensor` type makes +# sense, there are just as many operations where returning a pure Tensor is +# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.tv_tensors.Image`? +# If we were to preserve :class:`~torchvision.tv_tensors.TVTensor` types all +# the way, even model's logits or the output of the loss function would end up +# being of type :class:`~torchvision.tv_tensors.Image`, and surely that's not +# desirable. +# +# .. note:: +# +# This behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# Exceptions +# ^^^^^^^^^^ +# +# There are a few exceptions to this "unwrapping" rule: +# :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, +# :meth:`torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain +# the TVTensor type. +# +# Inplace operations on TVTensors like ``obj.add_()`` will preserve the type of +# ``obj``. However, the **returned** value of inplace operations will be a pure +# tensor: + +image = tv_tensors.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +# image got transformed in-place and is still a TVTensor Image, but new_image +# is a Tensor. They share the same underlying data and they're equal, just +# different classes. 
+assert isinstance(image, tv_tensors.Image) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, tv_tensors.Image) +assert (new_image == image).all() +assert new_image.data_ptr() == image.data_ptr() diff --git a/hubconf.py b/hubconf.py index 57ce7a0d12a..637827127ca 100644 --- a/hubconf.py +++ b/hubconf.py @@ -20,6 +20,7 @@ ) from torchvision.models.googlenet import googlenet from torchvision.models.inception import inception_v3 +from torchvision.models.maxvit import maxvit_t from torchvision.models.mnasnet import mnasnet0_5, mnasnet0_75, mnasnet1_0, mnasnet1_3 from torchvision.models.mobilenetv2 import mobilenet_v2 from torchvision.models.mobilenetv3 import mobilenet_v3_large, mobilenet_v3_small @@ -68,6 +69,17 @@ shufflenet_v2_x2_0, ) from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1 -from torchvision.models.swin_transformer import swin_b, swin_s, swin_t +from torchvision.models.swin_transformer import swin_b, swin_s, swin_t, swin_v2_b, swin_v2_s, swin_v2_t from torchvision.models.vgg import vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn +from torchvision.models.video import ( + mc3_18, + mvit_v1_b, + mvit_v2_s, + r2plus1d_18, + r3d_18, + s3d, + swin3d_b, + swin3d_s, + swin3d_t, +) from torchvision.models.vision_transformer import vit_b_16, vit_b_32, vit_h_14, vit_l_16, vit_l_32 diff --git a/ios/CMakeLists.txt b/ios/CMakeLists.txt index 6b9fd3925b2..4201240a427 100644 --- a/ios/CMakeLists.txt +++ b/ios/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(LIBTORCH_HEADER_ROOT ${LIBTORCH_HEADER_ROOT}) set(LIBRARY_OUTPUT_PATH ../lib) diff --git a/ios/LibTorchvision.podspec b/ios/LibTorchvision.podspec index ba87820e142..b88fb70ac40 100644 --- a/ios/LibTorchvision.podspec +++ b/ios/LibTorchvision.podspec @@ -1,8 +1,8 @@ -pytorch_version = '1.12.0' +pytorch_version = '2.0.0' Pod::Spec.new do |s| s.name = 'LibTorchvision' - s.version = '0.13.0' + s.version = '0.15.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/vision' diff --git a/ios/README.md b/ios/README.md new file mode 100644 index 00000000000..0b50245f1ee --- /dev/null +++ b/ios/README.md @@ -0,0 +1,3 @@ +## Status + +The iOS demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
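As a side note on the hubconf.py additions above: the newly listed entries (e.g. maxvit_t, swin_v2_t, swin_v2_s, swin_v2_b and the video models) become reachable through torch.hub once this file ships. The snippet below is a minimal, illustrative sketch of how such an entry point is typically consumed; it is not part of the patch, and it assumes network access to GitHub and that the entry names match what hubconf.py exposes.

import torch

# List the entry points exposed by hubconf.py on the default branch;
# the output should include the newly added names such as "swin_v2_t" and "maxvit_t".
print(torch.hub.list("pytorch/vision"))

# Instantiate one of the new builders without pretrained weights and
# switch it to inference mode.
model = torch.hub.load("pytorch/vision", "swin_v2_t", weights=None)
model.eval()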
diff --git a/ios/VisionTestApp/VisionTestApp/AppDelegate.h b/ios/VisionTestApp/VisionTestApp/AppDelegate.h index 0dde86886e6..27716f4b6ab 100644 --- a/ios/VisionTestApp/VisionTestApp/AppDelegate.h +++ b/ios/VisionTestApp/VisionTestApp/AppDelegate.h @@ -1,7 +1,7 @@ #import -@interface AppDelegate : UIResponder +@interface AppDelegate : UIResponder -@property(strong, nonatomic) UIWindow *window; +@property(strong, nonatomic) UIWindow* window; @end diff --git a/ios/VisionTestApp/VisionTestApp/ModelRunner.h b/ios/VisionTestApp/VisionTestApp/ModelRunner.h index f71c80c981c..cfef3a3f347 100644 --- a/ios/VisionTestApp/VisionTestApp/ModelRunner.h +++ b/ios/VisionTestApp/VisionTestApp/ModelRunner.h @@ -5,7 +5,7 @@ NS_ASSUME_NONNULL_BEGIN @interface ModelRunner : NSObject -+ (NSString* )run; ++ (NSString*)run; + (BOOL)setUp; @end diff --git a/ios/VisionTestApp/VisionTestApp/ViewController.h b/ios/VisionTestApp/VisionTestApp/ViewController.h index 82cb7c57f8a..d29a133d373 100644 --- a/ios/VisionTestApp/VisionTestApp/ViewController.h +++ b/ios/VisionTestApp/VisionTestApp/ViewController.h @@ -3,5 +3,4 @@ @interface ViewController : UIViewController - @end diff --git a/maintainer_guide.md b/maintainer_guide.md new file mode 100644 index 00000000000..3d66a701be1 --- /dev/null +++ b/maintainer_guide.md @@ -0,0 +1,76 @@ +# Torchvision maintainers guide + +This document aims at documenting user-facing policies / principles used when +developing and maintaining torchvision. Other maintainer info (e.g. release +process) can be found in the meta-internal wiki. + +### What is public and what is private? + +For the Python API, torchvision largely follows the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation) +which is consistent with other major packages +([numpy](https://numpy.org/neps/nep-0023-backwards-compatibility.html), +[scikit-learn](https://scikit-learn.org/dev/glossary.html#term-API) etc.). +We recognize that this policy is somewhat imperfect for some edge cases, and that +it's difficult to come up with an accurate technical definition. In broad terms, +which are usually well understood by users, the policy is that: + +- modules that can be accessed without leading underscore are public +- objects in a public file that don't have a leading underscore are public +- class attributes are public iff they have no leading underscore +- the rest of the modules / objects / class attributes are considered private + +The public API has backward-compatible (BC) guarantees defined in our +deprecation policy (see below). The private API has no BC guarantees. + +For C++, code is private. For Meta employees: if a C++ change breaks fbcode, fix +fbcode or revert the change. We should be careful about models running in +production and relying on torchvision ops. + +The `test` folder is not importable and is **private.** Even meta-internal +projects should *not* rely on it (it has happened in the past and is now +programmatically impossible). + +The training references do not have BC guarantees. Breaking changes are +possible, but we should make sure that the tutorials are still running properly, +and that their intended narrative is preserved (by e.g. checking outputs, +etc.). + +The rest of the folders (build, android, ios, etc.) are private and have no BC +guarantees. + +### Deprecation policy. + +Because they're disruptive, **deprecations should only be used sparingly**.
+ +We largely follow the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy): +breaking changes require a deprecation period of at least 2 versions. + +Deprecations should clearly indicate their deadline in the docs and warning +messages. Avoid not committing to a deadline, or keeping deprecated APIs for too +long: it gives no incentive for users to update their code, sends conflicting +messages ("why was this API removed while this other one is still around?"), and +accumulates debt in the project. + +### Should this attribute be public? Should this function be private? + +When designing an API it’s not always obvious what should be exposed as public, +and what should be kept as a private implementation detail. The following +guidelines can be useful: + +* Functional consistency throughout the library is a top priority, for users and + developers’ sake. In doubt and unless it’s clearly wrong, expose what other + similar classes expose. +* Think really hard about the users and their use-cases, and try to expose what + they would need to address those use-cases. Aggressively keep everything else + private. Remember that the “private -> public” direction is way smoother than + the “public -> private” one: in doubt, keep it private. +* When thinking about use-cases, the general API motto applies: make what’s + simple and common easy, and make what’s complex possible (80% / 20% rule). + There might be a ~1% left that’s not addressed: that’s OK. Also, **make what’s + wrong very hard**, if not impossible. + +As a good practice, always create new files and even classes with a leading +underscore in their name. This way, everything is private by default and the +only public surface is explicitly present in an `__init__.py` file. 
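To make the deprecation and naming guidelines above concrete, here is a small illustrative sketch; it is not taken from the codebase, and the module and function names (`_example_utils.py`, `resize_legacy`, the 0.17 deadline) are made up for the example:

# torchvision/_example_utils.py  (hypothetical; leading underscore keeps it private by default)
import warnings

def resize_legacy(img, size):
    # Deprecated per the two-release policy; the message states the removal version explicitly.
    warnings.warn(
        "resize_legacy() is deprecated and will be removed in torchvision 0.17. "
        "Use torchvision.transforms.v2.Resize instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    ...

# torchvision/__init__.py  (hypothetical)
# The only public surface is what is explicitly re-exported here:
# from torchvision._example_utils import resize_legacy

Keeping the definition in an underscored file and the re-export in `__init__.py` makes the public surface explicit, which is exactly the "private by default" practice the guide recommends.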
diff --git a/mypy.ini b/mypy.ini index c1d174f4595..e25212a169d 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,7 +7,7 @@ allow_redefinition = True no_implicit_optional = True warn_redundant_casts = True -[mypy-torchvision.prototype.features.*] +[mypy-torchvision.prototype.datapoints.*] ; untyped definitions and calls disallow_untyped_defs = True @@ -17,95 +17,43 @@ no_implicit_optional = True ; warnings warn_unused_ignores = True -warn_return_any = True ; miscellaneous strictness flags allow_redefinition = True [mypy-torchvision.prototype.transforms.*] -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True - -; miscellaneous strictness flags -allow_redefinition = True - -[mypy-torchvision.prototype.datasets.*] - -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True -warn_return_any = True -warn_unreachable = True - -; miscellaneous strictness flags -allow_redefinition = True - -[mypy-torchvision.io.image.*] - -ignore_errors = True - -[mypy-torchvision.io.video.*] - -ignore_errors = True - -[mypy-torchvision.models.densenet.*] - -ignore_errors=True - -[mypy-torchvision.models.detection.anchor_utils] - -ignore_errors = True - -[mypy-torchvision.models.detection.transform] - ignore_errors = True -[mypy-torchvision.models.detection.roi_heads] - -ignore_errors = True - -[mypy-torchvision.models.detection.faster_rcnn] +[mypy-torchvision.prototype.datasets.*] ignore_errors = True -[mypy-torchvision.models.detection.mask_rcnn] +[mypy-torchvision.prototype.models.*] ignore_errors = True -[mypy-torchvision.models.detection.keypoint_rcnn] +[mypy-torchvision.io.image.*] ignore_errors = True -[mypy-torchvision.models.detection.retinanet] +[mypy-torchvision.io.video.*] ignore_errors = True -[mypy-torchvision.models.detection.ssd] +[mypy-torchvision.io.video_reader] ignore_errors = True -[mypy-torchvision.models.detection.ssdlite] +[mypy-torchvision.models.*] -ignore_errors = True +ignore_errors=True -[mypy-torchvision.models.detection.fcos] +[mypy-torchvision.ops.*] ignore_errors = True -[mypy-torchvision.ops.*] +[mypy-torchvision.transforms._functional_pil] ignore_errors = True @@ -156,3 +104,7 @@ ignore_missing_imports = True [mypy-h5py.*] ignore_missing_imports = True + +[mypy-gdown.*] + +ignore_missing_imports = True diff --git a/packaging/README.md b/packaging/README.md deleted file mode 100644 index 3ceac53030e..00000000000 --- a/packaging/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Building torchvision packages for release - -TorchVision release packages are built by using `build_wheel.sh` and `build_conda.sh` for all permutations of -supported operating systems, compute platforms and python versions. - -OS/Python/Compute matrix is defined in https://github.com/pytorch/vision/blob/main/.circleci/regenerate.py diff --git a/packaging/build_cmake.sh b/packaging/build_cmake.sh deleted file mode 100755 index 35dfbc4a697..00000000000 --- a/packaging/build_cmake.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash -set -ex - -PARALLELISM=8 -if [ -n "$MAX_JOBS" ]; then - PARALLELISM=$MAX_JOBS -fi - -if [[ "$(uname)" != Darwin && "$OSTYPE" != "msys" ]]; then - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env -fi - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. 
"$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_plain_constraint - -if [[ "$OSTYPE" == "msys" ]]; then - conda install -yq conda-build cmake future - pip install dataclasses -fi - -setup_visual_studio_constraint -setup_junit_results_folder - -if [[ "$(uname)" == Darwin ]]; then - # TODO: this can be removed as soon as mkl's CMake support works with clang - # see https://github.com/pytorch/vision/pull/4203 for details - MKL_CONSTRAINT='mkl==2021.2.0' -else - MKL_CONSTRAINT='' -fi - -if [[ $CONDA_BUILD_VARIANT == "cpu" ]]; then - PYTORCH_MUTEX_CONSTRAINT='pytorch-mutex=1.0=cpu' -else - PYTORCH_MUTEX_CONSTRAINT='' -fi - -conda install -yq \pytorch=$PYTORCH_VERSION $CONDA_CUDATOOLKIT_CONSTRAINT $PYTORCH_MUTEX_CONSTRAINT $MKL_CONSTRAINT numpy -c nvidia -c "pytorch-${UPLOAD_CHANNEL}" -TORCH_PATH=$(dirname $(python -c "import torch; print(torch.__file__)")) - -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - conda install -yq libpng jpeg -else - yum install -y libpng-devel libjpeg-turbo-devel -fi - -if [[ "$OSTYPE" == "msys" ]]; then - source .circleci/unittest/windows/scripts/set_cuda_envs.sh -fi - -mkdir cpp_build -pushd cpp_build - -# Generate libtorchvision files -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA - -# Compile and install libtorchvision -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cmake.bat" $PARALLELISM - CONDA_PATH=$(dirname $(which python)) - cp -r "C:/Program Files (x86)/torchvision/include/torchvision" $CONDA_PATH/include -else - make -j$PARALLELISM - make install - - if [[ "$(uname)" == Darwin ]]; then - CONDA_PATH=$(dirname $(dirname $(which python))) - cp -r /usr/local/include/torchvision $CONDA_PATH/include/ - export C_INCLUDE_PATH=/usr/local/include - export CPLUS_INCLUDE_PATH=/usr/local/include - fi -fi - -popd - -# Install torchvision locally -python setup.py develop - -# Trace, compile and run project that uses Faster-RCNN -pushd test/tracing/frcnn -mkdir build - -# Trace model -python trace_model.py -cp fasterrcnn_resnet50_fpn.pt build - -cd build -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_frcnn.bat" $PARALLELISM - mv fasterrcnn_resnet50_fpn.pt Release - cd Release - export PATH=$(cygpath "C:/Program Files (x86)/torchvision/bin"):$(cygpath $TORCH_PATH)/lib:$PATH -else - make -j$PARALLELISM -fi - -# Run traced program -./test_frcnn_tracing - -# Compile and run the CPP example -popd -cd examples/cpp/hello_world -mkdir build - -# Trace model -python trace_model.py -cp resnet18.pt build - -cd build -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch - -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cpp_example.bat" $PARALLELISM - mv resnet18.pt Release - cd Release -else - make -j$PARALLELISM -fi - -# Run CPP example -./hello-world diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh deleted file mode 100755 index 7c45aa3e6d9..00000000000 --- a/packaging/build_conda.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. 
"$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_constraint -setup_visual_studio_constraint -setup_junit_results_folder -export CUDATOOLKIT_CHANNEL="nvidia" - -conda build -c $CUDATOOLKIT_CHANNEL -c defaults $CONDA_CHANNEL_FLAGS --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchvision diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh deleted file mode 100755 index 3299d16ec92..00000000000 --- a/packaging/build_wheel.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=wheel -setup_env -setup_wheel_python -pip_install numpy pyyaml future ninja -pip_install --upgrade setuptools -setup_pip_pytorch_version -python setup.py clean - -# Copy binaries to be included in the wheel distribution -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - if [[ "$(uname)" == Darwin ]]; then - # Install delocate to relocate the required binaries - pip_install "delocate>=0.9" - else - cp "$bin_path/Library/bin/libpng16.dll" torchvision - cp "$bin_path/Library/bin/libjpeg.dll" torchvision - fi -else - # Install auditwheel to get some inspection utilities - pip_install auditwheel - - # Point to custom libraries - export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH - export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include - export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib -fi - -download_copy_ffmpeg - -if [[ "$OSTYPE" == "msys" ]]; then - IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel -else - IS_WHEEL=1 python setup.py bdist_wheel -fi - - -if [[ "$(uname)" == Darwin ]]; then - pushd dist/ - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - for whl in *.whl; do - DYLD_FALLBACK_LIBRARY_PATH="$env_path/lib/:$DYLD_FALLBACK_LIBRARY_PATH" delocate-wheel -v --ignore-missing-dependencies $whl - done -else - if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" python $script_dir/wheel/relocate.py - else - LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python $script_dir/wheel/relocate.py - fi -fi diff --git a/packaging/cut_release.sh b/packaging/cut_release.sh new file mode 100755 index 00000000000..91e0e5ff15d --- /dev/null +++ b/packaging/cut_release.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# +# Usage (run from root of project): +# TEST_INFRA_BRANCH=release/2.1 RELEASE_BRANCH=release/2.1 RELEASE_VERSION=2.1.0 packaging/cut_release.sh +# +# TEST_INFRA_BRANCH: The release branch of test-infra that houses all reusable +# workflows +# +# RELEASE_BRANCH: The name of the release branch for this repo +# +# RELEASE_VERSION: Version of this current release + +set -eou pipefail + +# Create and Check out to Release Branch +git checkout -b "${RELEASE_BRANCH}" + +# Change all GitHub Actions to reference the test-infra release branch +# as opposed to main. 
+for i in .github/workflows/*.yml; do + if [[ "$OSTYPE" == "darwin"* ]]; then + sed -i '' -e s#@main#@"${TEST_INFRA_BRANCH}"# $i; + sed -i '' -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i; + else + sed -i -e s#@main#@"${TEST_INFRA_BRANCH}"# $i; + sed -i -e s#test-infra-ref:[[:space:]]main#"test-infra-ref: ${TEST_INFRA_BRANCH}"# $i; + fi +done + +# Update the Release Version in version.txt +echo "${RELEASE_VERSION}" >version.txt + +# Optional +# git add ./github/workflows/*.yml version.txt +# git commit -m "[RELEASE-ONLY CHANGES] Branch Cut for Release {RELEASE_VERSION}" +# git push origin "${RELEASE_BRANCH}" diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash deleted file mode 100644 index ad0f4f94d2f..00000000000 --- a/packaging/pkg_helpers.bash +++ /dev/null @@ -1,359 +0,0 @@ -# A set of useful bash functions for common functionality we need to do in -# many build scripts - - -# Setup CUDA environment variables, based on CU_VERSION -# -# Inputs: -# CU_VERSION (cpu, cu92, cu100) -# NO_CUDA_PACKAGE (bool) -# BUILD_TYPE (conda, wheel) -# -# Outputs: -# VERSION_SUFFIX (e.g., "") -# PYTORCH_VERSION_SUFFIX (e.g., +cpu) -# WHEEL_DIR (e.g., cu100/) -# CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension) -# FORCE_CUDA (respected by torchvision setup.py) -# NVCC_FLAGS (respected by torchvision setup.py) -# -# Precondition: CUDA versions are installed in their conventional locations in -# /usr/local/cuda-* -# -# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX? If you're building -# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX == -# PYTORCH_VERSION_SUFFIX and everyone is happy. However, if you are building a -# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always -# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU -# version of a Python package. But that doesn't apply if you're on OS X, -# since the default CU_VERSION on OS X is cpu. -setup_cuda() { - - # First, compute version suffixes. 
By default, assume no version suffixes - export VERSION_SUFFIX="" - export PYTORCH_VERSION_SUFFIX="" - export WHEEL_DIR="" - # Wheel builds need suffixes (but not if they're on OS X, which never has suffix) - if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then - export PYTORCH_VERSION_SUFFIX="+$CU_VERSION" - # Match the suffix scheme of pytorch, unless this package does not have - # CUDA builds (in which case, use default) - if [[ -z "$NO_CUDA_PACKAGE" ]]; then - export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX" - export WHEEL_DIR="$CU_VERSION/" - fi - fi - - # Now work out the CUDA settings - case "$CU_VERSION" in - cu116) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6" - else - export CUDA_HOME=/usr/local/cuda-11.6/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu113) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.3" - else - export CUDA_HOME=/usr/local/cuda-11.3/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu102) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2" - else - export CUDA_HOME=/usr/local/cuda-10.2/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5" - ;; - cpu) - ;; - rocm*) - export FORCE_CUDA=1 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - if [[ -n "$CUDA_HOME" ]]; then - # Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one - export PATH="$CUDA_HOME/bin:$PATH" - export FORCE_CUDA=1 - fi -} - -# Populate build version if necessary, and add version suffix -# -# Inputs: -# BUILD_VERSION (e.g., 0.2.0 or empty) -# VERSION_SUFFIX (e.g., +cpu) -# -# Outputs: -# BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu) -# -# Fill BUILD_VERSION if it doesn't exist already with a nightly string -# Usage: setup_build_version 0.2.0 -setup_build_version() { - if [[ -z "$BUILD_VERSION" ]]; then - if [[ -z "$1" ]]; then - setup_base_build_version - else - BUILD_VERSION="$1" - fi - BUILD_VERSION="$BUILD_VERSION.dev$(date "+%Y%m%d")$VERSION_SUFFIX" - else - BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX" - fi - - # Set build version based on tag if on tag - if [[ -n "${CIRCLE_TAG}" ]]; then - # Strip tag - BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}" - fi - - export BUILD_VERSION -} - -setup_base_build_version() { - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - # version.txt for some reason has `a` character after major.minor.rev - # command below yields 0.10.0 from version.txt containing 0.10.0a0 - BUILD_VERSION=$( cut -f 1 -d a "$SCRIPT_DIR/../version.txt" ) - export BUILD_VERSION -} - -# Set some useful variables for OS X, if applicable -setup_macos() { - if [[ "$(uname)" == Darwin ]]; then - export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ - fi -} - - -# Top-level entry point for things every package will need to do -# -# Usage: setup_env 0.2.0 -setup_env() { - setup_cuda - setup_build_version "$1" - setup_macos -} - -# Function to retry functions that sometimes timeout or have flaky failures -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - -# Inputs: -# PYTHON_VERSION (3.7, 3.8, 3.9) -# UNICODE_ABI (bool) -# -# Outputs: -# PATH modified to put correct Python version in PATH -# -# Precondition: If 
Linux, you are in a soumith/manylinux-cuda* Docker image -setup_wheel_python() { - if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - eval "$(conda shell.bash hook)" - conda env remove -n "env$PYTHON_VERSION" || true - conda create ${CONDA_CHANNEL_FLAGS} -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION" - conda activate "env$PYTHON_VERSION" - # Install libpng from Anaconda (defaults) - conda install ${CONDA_CHANNEL_FLAGS} libpng "jpeg<=9b" -y - else - # Install native CentOS libJPEG, freetype and GnuTLS - yum install -y libjpeg-turbo-devel freetype gnutls - case "$PYTHON_VERSION" in - 3.7) python_abi=cp37-cp37m ;; - 3.8) python_abi=cp38-cp38 ;; - 3.9) python_abi=cp39-cp39 ;; - 3.10) python_abi=cp310-cp310 ;; - *) - echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - ;; - esac - # Download all the dependencies required to compile image and video_reader - # extensions - - mkdir -p ext_libraries - pushd ext_libraries - popd - export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH" - fi -} - -# Install with pip a bit more robustly than the default -pip_install() { - retry pip install --progress-bar off "$@" -} - -# Install torch with pip, respecting PYTORCH_VERSION, and record the installed -# version into PYTORCH_VERSION, if applicable -setup_pip_pytorch_version() { - if [[ -z "$PYTORCH_VERSION" ]]; then - # Install latest prerelease version of torch, per our nightlies, consistent - # with the requested cuda version - pip_install --pre torch -f "https://download.pytorch.org/whl/nightly/${WHEEL_DIR}torch_nightly.html" - if [[ "$CUDA_VERSION" == "cpu" ]]; then - # CUDA and CPU are ABI compatible on the CPU-only parts, so strip - # in this case - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//' | sed 's/+.\+//')" - else - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//')" - fi - else - pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \ - -f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \ - -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html" - fi -} - -# Fill PYTORCH_VERSION with the latest conda nightly version, and -# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions -# -# You MUST have populated PYTORCH_VERSION_SUFFIX before hand. -setup_conda_pytorch_constraint() { - if [[ -z "$PYTORCH_VERSION" ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-nightly -c pytorch" - PYTHON="python" - # Check if we have python 3 instead and prefer that - if python3 --version >/dev/null 2>/dev/null; then - PYTHON="python3" - fi - export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-nightly]' | \ - ${PYTHON} -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \ - cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - cuver_2 = (cuver[:-1] + '.' 
+ cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - print(re.sub(r'\\+.*$', '', \ - [x['version'] for x in json.load(sys.stdin)['pytorch'] \ - if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \ - and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")" - if [[ -z "$PYTORCH_VERSION" ]]; then - echo "PyTorch version auto detection failed" - echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - fi - else - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-${UPLOAD_CHANNEL}" - fi - if [[ "$CU_VERSION" == cpu ]]; then - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION" - else - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - fi - if [[ "$OSTYPE" == msys && "$CU_VERSION" == cu92 ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c defaults -c numba/label/dev" - fi -} - -# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT -setup_conda_cudatoolkit_constraint() { - export CONDA_BUILD_VARIANT="cuda" - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.7 # [not osx]" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.6 # [not osx]" - ;; - cu113) - export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=11.3,<11.4 # [not osx]" - ;; - cu102) - export CONDA_CUDATOOLKIT_CONSTRAINT="- cudatoolkit >=10.2,<10.3 # [not osx]" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -setup_conda_cudatoolkit_plain_constraint() { - export CONDA_BUILD_VARIANT="cuda" - export CMAKE_USE_CUDA=1 - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.7" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.6" - ;; - cu113) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=11.3" - ;; - cu102) - export CONDA_CUDATOOLKIT_CONSTRAINT="cudatoolkit=10.2" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -# Build the proper compiler package before building the final package -setup_visual_studio_constraint() { - if [[ "$OSTYPE" == "msys" ]]; then - export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR - conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE - cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchvision/conda_build_config.yaml - fi -} - -setup_junit_results_folder() { - if [[ "$CI" == "true" ]]; then - export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml" - fi -} - - -download_copy_ffmpeg() { - if [[ "$OSTYPE" == "msys" ]]; then - # conda install -yq ffmpeg=4.2 -c pytorch - # curl -L -q https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2 - # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=- - # cp Library/bin/*.dll 
../torchvision - echo "FFmpeg is disabled currently on Windows" - else - if [[ "$(uname)" == Darwin ]]; then - conda install -yq ffmpeg=4.2 -c pytorch - conda install -yq wget - else - # pushd ext_libraries - # wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2 - # tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # rm -rf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # ldconfig - # which ffmpeg - # popd - echo "FFmpeg is disabled currently on Linux" - fi - fi -} diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh new file mode 100644 index 00000000000..7aefa2649e6 --- /dev/null +++ b/packaging/post_build_script.sh @@ -0,0 +1,16 @@ +#!/bin/bash +set -euxo pipefail + +if [ -n "${CUDA_HOME:-}" ]; then + LD_LIBRARY_PATH="/usr/local/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" +fi + +python packaging/wheel/relocate.py + +if [[ "$(uname)" == "Linux" && "$(uname -m)" != "aarch64" ]]; then + extra_decoders_channel="--pre --index-url https://download.pytorch.org/whl/nightly/cpu" +else + extra_decoders_channel="" +fi + +pip install torchvision-extra-decoders $extra_decoders_channel diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh new file mode 100644 index 00000000000..fcacf4bf8a4 --- /dev/null +++ b/packaging/pre_build_script.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [[ "$(uname)" == Darwin ]]; then + # Uninstall Conflicting jpeg brew formulae + jpeg_packages=$(brew list | grep jpeg) + echo "Existing Jpeg-related Brew libraries" + echo $jpeg_packages + for pkg in $jpeg_packages; do + brew uninstall --ignore-dependencies --force $pkg || true + done + + conda install -y wget +fi + +if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then + conda install libpng libwebp -y + # Installing webp also installs a non-turbo jpeg, so we uninstall jpeg stuff + # before re-installing them + conda uninstall libjpeg-turbo libjpeg -y + conda install -y ffmpeg=4.2 -c pytorch + conda install -y libjpeg-turbo -c pytorch + + # Copy binaries to be included in the wheel distribution + if [[ "$OSTYPE" == "msys" ]]; then + python_exec="$(which python)" + bin_path=$(dirname $python_exec) + cp "$bin_path/Library/bin/libjpeg.dll" torchvision + fi +else + + if [[ "$ARCH" == "aarch64" ]]; then + conda install libpng -y + conda install -y ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly + fi + + conda install libwebp -y + conda install libjpeg-turbo -c pytorch + yum install -y freetype gnutls + pip install "auditwheel<6.3.0" +fi + +pip install numpy pyyaml future ninja +pip install --upgrade setuptools==72.1.0 diff --git a/packaging/pre_build_script_arm64.sh b/packaging/pre_build_script_arm64.sh new file mode 100644 index 00000000000..b157139c26a --- /dev/null +++ b/packaging/pre_build_script_arm64.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +echo "Building vision dependencies and wheel started." 
+ +# Set environment variables +export SRC_PATH="$GITHUB_WORKSPACE/$SRC_DIR" +export CMAKE_BUILD_TYPE="$BUILD_TYPE" +export VCVARSALL_PATH="$DEPENDENCIES_DIR/VSBuildTools/VC/Auxiliary/Build/vcvarsall.bat" +export TRIPLET_FILE="triplets/arm64-windows.cmake" +export PYTORCH_VERSION="$PYTORCH_VERSION" +export CHANNEL="$CHANNEL" + +# Dependencies +mkdir -p "$DOWNLOADS_DIR" +mkdir -p "$DEPENDENCIES_DIR" +echo "*" > "$DOWNLOADS_DIR/.gitignore" +echo "*" > "$DEPENDENCIES_DIR/.gitignore" + +# Install vcpkg +cd "$DOWNLOADS_DIR" || exit +git clone https://github.com/microsoft/vcpkg.git +cd vcpkg || exit +./bootstrap-vcpkg.sh + +# Set vcpkg to only build release packages +echo "set(VCPKG_BUILD_TYPE release)" >> "$TRIPLET_FILE" + +# Install dependencies using vcpkg +./vcpkg install libjpeg-turbo:arm64-windows --x-install-root="$DEPENDENCIES_DIR" +./vcpkg install libwebp:arm64-windows --x-install-root="$DEPENDENCIES_DIR" +./vcpkg install libpng[tools]:arm64-windows --x-install-root="$DEPENDENCIES_DIR" + +# Copy files using cp +cp "$DEPENDENCIES_DIR/arm64-windows/lib/libpng16.lib" "$DEPENDENCIES_DIR/arm64-windows/lib/libpng.lib" +cp "$DEPENDENCIES_DIR/arm64-windows/bin/libpng16.dll" "$DEPENDENCIES_DIR/arm64-windows/bin/libpng.dll" +cp "$DEPENDENCIES_DIR/arm64-windows/bin/libpng16.pdb" "$DEPENDENCIES_DIR/arm64-windows/bin/libpng.pdb" +mkdir -p "$DEPENDENCIES_DIR/Library/" +cp -r "$DEPENDENCIES_DIR/arm64-windows/"* "$DEPENDENCIES_DIR/Library/" +cp -r "$DEPENDENCIES_DIR/Library/tools/libpng/"* "$DEPENDENCIES_DIR/Library/bin/" +cp -r "$DEPENDENCIES_DIR/Library/bin/"* "$SRC_PATH/torchvision" + +# Source directory +cd "$SRC_PATH" || exit + +# Create virtual environment +python -m pip install --upgrade pip +python -m venv .venv +echo "*" > .venv/.gitignore +source .venv/Scripts/activate + +# Install dependencies +pip install numpy==2.2.3 + +if [ "$CHANNEL" = "release" ]; then + echo "Installing latest stable version of PyTorch." + # TODO: update when arm64 torch available on pypi + pip3 install --pre torch --index-url https://download.pytorch.org/whl/torch/ +elif [ "$CHANNEL" = "test" ]; then + echo "Installing PyTorch version $PYTORCH_VERSION." + pip3 install --pre torch=="$PYTORCH_VERSION" --index-url https://download.pytorch.org/whl/test +else + echo "CHANNEL is not set, installing PyTorch from nightly." + pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu +fi + +echo "Dependencies install finished successfully." diff --git a/packaging/torchvision/conda_build_config.yaml b/packaging/torchvision/conda_build_config.yaml deleted file mode 100644 index 52b95952ddf..00000000000 --- a/packaging/torchvision/conda_build_config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -channel_sources: - - pytorch-nightly,pytorch,defaults -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2017 # [win] -cxx_compiler: - - vs2017 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. 
-cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml deleted file mode 100644 index 105e28c453e..00000000000 --- a/packaging/torchvision/meta.yaml +++ /dev/null @@ -1,72 +0,0 @@ -{% set build_variant = environ.get('CONDA_BUILD_VARIANT', 'cpu') %} -package: - name: torchvision - version: "{{ environ.get('BUILD_VERSION') }}" - -source: - path: "{{ environ.get('SOURCE_ROOT_DIR') }}" - -requirements: - build: - - {{ compiler('c') }} # [win] - - libpng - - jpeg - # NOTE: The only ffmpeg version that we build is actually 4.2 - - ffmpeg >=4.2 # [not win] - - host: - - python - - setuptools - - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] - {{ environ.get('CONDA_PYTORCH_BUILD_CONSTRAINT') }} - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} - - run: - - python - - defaults::numpy >=1.11 - - requests - - libpng - - ffmpeg >=4.2 # [not win] - - jpeg - - pillow >=5.3.0, !=8.3.* - - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] - {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} - {{ environ.get('CONDA_CUDATOOLKIT_CONSTRAINT', '') }} - - {% if build_variant == 'cpu' %} - run_constrained: - - cpuonly - {% elif not osx %} - run_constrained: - - cpuonly <0 - {% endif %} - -build: - string: py{{py}}_{{ environ['CU_VERSION'] }} - script: python setup.py install --single-version-externally-managed --record=record.txt - script_env: - - CUDA_HOME - - FORCE_CUDA - - BUILD_VERSION - - TORCH_CUDA_ARCH_LIST - - MACOSX_DEPLOYMENT_TARGET - -test: - imports: - - torchvision - - torchvision.datasets - - torchvision.transforms - source_files: - - test - requires: - - pytest - - scipy - - jpeg - - ca-certificates - - -about: - home: https://github.com/pytorch/vision - license: BSD - license_file: LICENSE - summary: 'image and video datasets and models for torch deep learning' diff --git a/packaging/vs2017/activate.bat b/packaging/vs2017/activate.bat deleted file mode 100644 index ccecfc25442..00000000000 --- a/packaging/vs2017/activate.bat +++ /dev/null @@ -1,44 +0,0 @@ -:: Set env vars that tell distutils to use the compiler that we put on path -SET DISTUTILS_USE_SDK=1 -SET MSSdk=1 - -SET "VS_VERSION=15.0" -SET "VS_MAJOR=15" -SET "VS_YEAR=2017" - -set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" -set "MSYS2_ENV_CONV_EXCL=CL" - -:: For Python 3.5+, ensure that we link with the dynamic runtime. See -:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info -set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VSINSTALLDIR=%%i\" - goto :vswhere - ) -) - -:vswhere - -:: Shorten PATH to avoid the `input line too long` error. -SET MyPath=%PATH% - -setlocal EnableDelayedExpansion - -SET TempPath="%MyPath:;=";"%" -SET var= -FOR %%a IN (%TempPath%) DO ( - IF EXIST %%~sa ( - SET "var=!var!;%%~sa" - ) -) - -set "TempPath=!var:~1!" -endlocal & set "PATH=%TempPath%" - -:: Shorten current directory too -FOR %%A IN (.) 
DO CD "%%~sA" - -:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2017/conda_build_config.yaml b/packaging/vs2017/conda_build_config.yaml deleted file mode 100644 index 2479ceb3e76..00000000000 --- a/packaging/vs2017/conda_build_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2017 # [win] -cxx_compiler: - - vs2017 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. -cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/vs2017/install_activate.bat b/packaging/vs2017/install_activate.bat deleted file mode 100644 index 253d2f2c2c1..00000000000 --- a/packaging/vs2017/install_activate.bat +++ /dev/null @@ -1,29 +0,0 @@ -set YEAR=2017 -set VER=15 - -mkdir "%PREFIX%\etc\conda\activate.d" -COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - -IF "%cross_compiler_target_platform%" == "win-64" ( - set "target_platform=amd64" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - IF "%VSDEVCMD_ARGS%" == "" ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) ELSE ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) else ( - set "target_platform=x86" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd - ) diff --git a/packaging/vs2017/install_runtime.bat b/packaging/vs2017/install_runtime.bat deleted file mode 100644 index 5163c16cf24..00000000000 --- a/packaging/vs2017/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2017 - -rem :: This should always be present for VC installed with VS. 
Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" -rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2017 -set "VC_VER=141" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/vs2017/meta.yaml b/packaging/vs2017/meta.yaml deleted file mode 100644 index 1f569525ee1..00000000000 --- a/packaging/vs2017/meta.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{% set vcver="14.1" %} -{% set vcfeature="14" %} -{% set vsyear="2017" %} -{% set fullver="15.4.27004.2010" %} - -package: - name: vs{{ vsyear }} - version: {{ fullver }} - -build: - skip: True [not win] - script_env: - - VSDEVCMD_ARGS # [win] - -outputs: - - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} - script: install_activate.bat - track_features: - # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141". - strong: - - vc{{ vcfeature }} - about: - summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler - license: BSD 3-clause diff --git a/packaging/vs2019/activate.bat b/packaging/vs2019/activate.bat deleted file mode 100644 index 6f607ba7518..00000000000 --- a/packaging/vs2019/activate.bat +++ /dev/null @@ -1,44 +0,0 @@ -:: Set env vars that tell distutils to use the compiler that we put on path -SET DISTUTILS_USE_SDK=1 -SET MSSdk=1 - -SET "VS_VERSION=16.0" -SET "VS_MAJOR=16" -SET "VS_YEAR=2019" - -set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" -set "MSYS2_ENV_CONV_EXCL=CL" - -:: For Python 3.5+, ensure that we link with the dynamic runtime. See -:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info -set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VSINSTALLDIR=%%i\" - goto :vswhere - ) -) - -:vswhere - -:: Shorten PATH to avoid the `input line too long` error. 
-SET MyPath=%PATH% - -setlocal EnableDelayedExpansion - -SET TempPath="%MyPath:;=";"%" -SET var= -FOR %%a IN (%TempPath%) DO ( - IF EXIST %%~sa ( - SET "var=!var!;%%~sa" - ) -) - -set "TempPath=!var:~1!" -endlocal & set "PATH=%TempPath%" - -:: Shorten current directory too -FOR %%A IN (.) DO CD "%%~sA" - -:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2019/conda_build_config.yaml b/packaging/vs2019/conda_build_config.yaml deleted file mode 100644 index 7bd8de2ea5b..00000000000 --- a/packaging/vs2019/conda_build_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2019 # [win] -cxx_compiler: - - vs2019 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. -cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/vs2019/install_activate.bat b/packaging/vs2019/install_activate.bat deleted file mode 100644 index 9e60ccfd2dc..00000000000 --- a/packaging/vs2019/install_activate.bat +++ /dev/null @@ -1,29 +0,0 @@ -set YEAR=2019 -set VER=16 - -mkdir "%PREFIX%\etc\conda\activate.d" -COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - -IF "%cross_compiler_target_platform%" == "win-64" ( - set "target_platform=amd64" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - IF "%VSDEVCMD_ARGS%" == "" ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) ELSE ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) else ( - set "target_platform=x86" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd - ) diff --git a/packaging/vs2019/install_runtime.bat b/packaging/vs2019/install_runtime.bat deleted file mode 100644 index e09a5ccfb0f..00000000000 --- a/packaging/vs2019/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2019 - -rem :: This should always be present for VC installed with VS. 
Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" -rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2019 -set "VC_VER=142" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/vs2019/meta.yaml b/packaging/vs2019/meta.yaml deleted file mode 100644 index 94a0ed4db3e..00000000000 --- a/packaging/vs2019/meta.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{% set vcver="14.2" %} -{% set vcfeature="14" %} -{% set vsyear="2019" %} -{% set fullver="15.4.27004.2010" %} - -package: - name: vs{{ vsyear }} - version: {{ fullver }} - -build: - skip: True [not win] - script_env: - - VSDEVCMD_ARGS # [win] - -outputs: - - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} - script: install_activate.bat - track_features: - # VS 2019 is binary-compatible with VS 2017/vc 14.1 and 2015/vc14. Tools are "v142". 
- strong: - - vc{{ vcfeature }} - about: - summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler - license: BSD 3-clause diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py index e6a4ef9d458..4587f3798da 100644 --- a/packaging/wheel/relocate.py +++ b/packaging/wheel/relocate.py @@ -2,7 +2,6 @@ import glob import hashlib -import io # Standard library imports import os @@ -16,7 +15,10 @@ # Third party imports if sys.platform == "linux": - from auditwheel.lddtree import lddtree + try: + from auditwheel.lddtree import lddtree + except ImportError: + from auditwheel import lddtree ALLOWLIST = { @@ -65,21 +67,12 @@ PYTHON_VERSION = sys.version_info -def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE): - """Yield pieces of data from a file-like object until EOF.""" - while True: - chunk = file.read(size) - if not chunk: - break - yield chunk - - def rehash(path, blocksize=1 << 20): """Return (hash, length) for path using hashlib.sha256()""" h = hashlib.sha256() length = 0 with open(path, "rb") as f: - for block in read_chunks(f, size=blocksize): + while block := f.read(blocksize): length += len(block) h.update(block) digest = "sha256=" + urlsafe_b64encode(h.digest()).decode("latin1").rstrip("=") @@ -191,7 +184,7 @@ def relocate_elf_library(patchelf, output_dir, output_library, binary): print("Copying dependencies to wheel directory") new_libraries_path = osp.join(output_dir, "torchvision.libs") - os.makedirs(new_libraries_path) + os.makedirs(new_libraries_path, exist_ok=True) new_names = {binary: binary_path} diff --git a/packaging/windows/internal/build_cpp_example.bat b/packaging/windows/internal/build_cpp_example.bat index e3f7afe9f02..129c574e391 100644 --- a/packaging/windows/internal/build_cpp_example.bat +++ b/packaging/windows/internal/build_cpp_example.bat @@ -1,3 +1,3 @@ @echo on set CL=/I"C:\Program Files (x86)\torchvision\include" -msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" hello-world.vcxproj -maxcpucount:%1 +msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" run_model.vcxproj -maxcpucount:%1 diff --git a/packaging/windows/internal/build_frcnn.bat b/packaging/windows/internal/build_frcnn.bat deleted file mode 100644 index 36e3757d01c..00000000000 --- a/packaging/windows/internal/build_frcnn.bat +++ /dev/null @@ -1,3 +0,0 @@ -@echo on -set CL=/I"C:\Program Files (x86)\torchvision\include" -msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" test_frcnn_tracing.vcxproj -maxcpucount:%1 diff --git a/packaging/windows/internal/cuda_install.bat b/packaging/windows/internal/cuda_install.bat deleted file mode 100644 index de5bbd04515..00000000000 --- a/packaging/windows/internal/cuda_install.bat +++ /dev/null @@ -1,171 +0,0 @@ -@echo on - -if "%CU_VERSION%" == "cpu" ( - echo Skipping for CPU builds - exit /b 0 -) - -set SRC_DIR=%~dp0\.. 
- -if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" - -rem in unit test workflow, we get CUDA_VERSION, for example 11.1 -if defined CUDA_VERSION ( - set CUDA_VER=%CUDA_VERSION:.=% -) else ( - set CUDA_VER=%CU_VERSION:cu=% -) - -set /a CUDA_VER=%CU_VERSION:cu=% -set CUDA_VER_MAJOR=%CUDA_VER:~0,-1% -set CUDA_VER_MINOR=%CUDA_VER:~-1,1% -set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% -set CUDNN_FOLDER="cuda" -set CUDNN_LIB_FOLDER="lib\x64" - -if %CUDA_VER% EQU 102 goto cuda102 -if %CUDA_VER% EQU 113 goto cuda113 -if %CUDA_VER% EQU 116 goto cuda116 - -echo CUDA %CUDA_VERSION_STR% is not supported -exit /b 1 - -:cuda102 - -if not exist "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" ( - curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.2.89_441.22_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" - set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvjpeg_10.2 nvjpeg_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2" -) - -if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" ( - curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-10.2-windows10-x64-v7.6.5.32.zip --output "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" -) - -rem The below only for cu102, if it's used in other version, e.g. cu111, torch.cuda.is_availabe() would be False. 
-if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" ( - curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" - if errorlevel 1 exit /b 1 -) - -echo Installing GPU driver DLLs -7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32" - -goto cuda_common - -:cuda113 - -set CUDA_INSTALL_EXE=cuda_11.3.0_465.89_win10.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.3 nvcc_11.3 cuobjdump_11.3 nvprune_11.3 nvprof_11.3 cupti_11.3 cublas_11.3 cublas_dev_11.3 cudart_11.3 cufft_11.3 cufft_dev_11.3 curand_11.3 curand_dev_11.3 cusolver_11.3 cusolver_dev_11.3 cusparse_11.3 cusparse_dev_11.3 npp_11.3 npp_dev_11.3 nvjpeg_11.3 nvjpeg_dev_11.3 nvrtc_11.3 nvrtc_dev_11.3 nvml_dev_11.3" - -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - -:cuda116 - -set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6" -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - -:cuda_common - -if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( - curl -k -L https://www.dropbox.com/s/9mcolalfdj4n979/NvToolsExt.7z?dl=1 --output "%SRC_DIR%\temp_build\NvToolsExt.7z" - if errorlevel 1 exit /b 1 -) - -echo Installing CUDA toolkit... 
-7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda" -pushd "%SRC_DIR%\temp_build\cuda" -sc config wuauserv start= disabled -sc stop wuauserv -sc query wuauserv - -start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%cd%/cuda_install_logs" -echo %errorlevel% - -popd - -echo Installing VS integration... -rem It's for VS 2019 -if "%CUDA_VER_MAJOR%" == "10" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) -if "%CUDA_VER_MAJOR%" == "11" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\visual_studio_integration\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) - -echo Installing NvToolsExt... -7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" - -echo Setting up environment... -set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" -set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" - -if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( - echo CUDA %CUDA_VERSION_STR% installed failed. - echo --------- RunDll32.exe.log - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.RunDll32.exe.log" - echo --------- setup.exe.log ------- - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.setup.exe.log" - exit /b 1 -) - -echo Installing cuDNN... 
-7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" - -echo Cleaning temp files -rd /s /q "%SRC_DIR%\temp_build" || ver > nul diff --git a/packaging/windows/internal/driver_update.bat b/packaging/windows/internal/driver_update.bat deleted file mode 100644 index 00b43affc01..00000000000 --- a/packaging/windows/internal/driver_update.bat +++ /dev/null @@ -1,25 +0,0 @@ -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe" -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -if errorlevel 1 exit /b 1 - -start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot -if errorlevel 1 exit /b 1 - -del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL - -setlocal EnableDelayedExpansion -set NVIDIA_GPU_EXISTS=0 -for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( - set GPUS=%%i - if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( - SET NVIDIA_GPU_EXISTS=1 - goto gpu_check_end - ) -) -:gpu_check_end -endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% - -if "%NVIDIA_GPU_EXISTS%" == "0" ( - echo "CUDA Driver installation Failed" - exit /b 1 -) diff --git a/packaging/windows/internal/vc_env_helper.bat b/packaging/windows/internal/vc_env_helper.bat index e85a372f93d..1f50b1b05b5 100644 --- a/packaging/windows/internal/vc_env_helper.bat +++ b/packaging/windows/internal/vc_env_helper.bat @@ -1,11 +1,7 @@ @echo on -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 -if "%VC_YEAR%" == "2017" ( - set VC_VERSION_LOWER=15 - set VC_VERSION_UPPER=16 -) +set VC_VERSION_LOWER=17 +set VC_VERSION_UPPER=18 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( @@ -24,6 +20,8 @@ if "%VSDEVCMD_ARGS%" == "" ( @echo on +if "%CU_VERSION%" == "xpu" call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" + set DISTUTILS_USE_SDK=1 set args=%1 diff --git a/packaging/windows/internal/vc_install_helper.sh b/packaging/windows/internal/vc_install_helper.sh deleted file mode 100644 index cdae18065b9..00000000000 --- a/packaging/windows/internal/vc_install_helper.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -ex - -if [[ "$CU_VERSION" == "cu92" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="-vcvars_ver=14.13" - powershell packaging/windows/internal/vs2017_install.ps1 -elif [[ "$CU_VERSION" == "cu100" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="" - powershell packaging/windows/internal/vs2017_install.ps1 -else - export VC_YEAR=2019 - export VSDEVCMD_ARGS="" -fi diff --git a/packaging/windows/internal/vs2017_install.ps1 b/packaging/windows/internal/vs2017_install.ps1 deleted file mode 100644 index 3e953de1ab7..00000000000 --- a/packaging/windows/internal/vs2017_install.ps1 +++ /dev/null @@ -1,25 +0,0 @@ -$VS_DOWNLOAD_LINK = 
"https://aka.ms/vs/15/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", - "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.TextTemplating", - "--add Microsoft.VisualStudio.Component.VC.CoreIde", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2017 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." - exit 1 -} diff --git a/packaging/windows/internal/vs2019_install.ps1 b/packaging/windows/internal/vs2019_install.ps1 deleted file mode 100644 index e436051f0db..00000000000 --- a/packaging/windows/internal/vs2019_install.ps1 +++ /dev/null @@ -1,21 +0,0 @@ -$VS_DOWNLOAD_LINK = "https://aka.ms/vs/16/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.VC.CoreBuildTools", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2019 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]." - exit 1 -} diff --git a/pyproject.toml b/pyproject.toml index 8f0be4245bd..61e4a957fc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ first_party_detection = false [tool.black] line-length = 120 -target-version = ["py37"] +target-version = ["py38"] [tool.ufmt] diff --git a/pytest.ini b/pytest.ini index 1dde465d32f..8d52b55d5a6 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,9 @@ [pytest] addopts = - # show summary of all tests that did not pass - -ra + # show tests that (f)ailed, (E)rror, or (X)passed in the summary + -rfEX # Make tracebacks shorter - --tb=native + --tb=short # enable all warnings -Wd --ignore=test/test_datasets_download.py diff --git a/references/classification/README.md b/references/classification/README.md index e8d62134ca2..bc481f421ed 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -120,7 +120,7 @@ Here `$MODEL` is one of `efficientnet_v2_s` and `efficientnet_v2_m`. Note that the Small variant had a `$TRAIN_SIZE` of `300` and a `$EVAL_SIZE` of `384`, while the Medium `384` and `480` respectively. 
Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 4 nodes, each with 8 GPUs (for a total of 32 GPUs), +For generating the pre-trained weights, we trained with 4 nodes, each with 8 GPUs (for a total of 32 GPUs), and `--batch_size 32`. The weights of the Large variant are ported from the original paper rather than trained from scratch. See the `EfficientNet_V2_L_Weights` entry for their exact preprocessing transforms. @@ -135,7 +135,7 @@ torchrun --nproc_per_node=8 train.py\ --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ --lr-warmup-epochs=5 --lr-warmup-decay=0.1 ``` -Here `$MODEL` is one of `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_1_6gf`, `regnet_y_400mf`, `regnet_y_800mf` and `regnet_y_1_6gf`. Please note we used learning rate 0.4 for `regent_y_400mf` to get the same Acc@1 as [the paper)(https://arxiv.org/abs/2003.13678). +Here `$MODEL` is one of `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_1_6gf`, `regnet_y_400mf`, `regnet_y_800mf` and `regnet_y_1_6gf`. Please note we used learning rate 0.4 for `regent_y_400mf` to get the same Acc@1 as [the paper](https://arxiv.org/abs/2003.13678). #### Medium models ``` @@ -167,7 +167,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), +For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), and `--batch_size 64`. #### vit_b_32 @@ -180,7 +180,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 256`. #### vit_l_16 @@ -193,7 +193,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 64`. #### vit_l_32 @@ -206,7 +206,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), +For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), and `--batch_size 64`. @@ -221,7 +221,7 @@ torchrun --nproc_per_node=8 train.py\ Here `$MODEL` is one of `convnext_tiny`, `convnext_small`, `convnext_base` and `convnext_large`. Note that each variant had its `--val-resize-size` optimized in a post-training step, see their `Weights` entry for their exact value. Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 64`. @@ -245,6 +245,14 @@ Here `$MODEL` is one of `swin_v2_t`, `swin_v2_s` or `swin_v2_b`. 
Note that `--val-resize-size` was optimized in a post-training step, see their `Weights` entry for the exact value. +### MaxViT +``` +torchrun --nproc_per_node=8 --n_nodes=4 train.py\ +--model $MODEL --epochs 400 --batch-size 128 --opt adamw --lr 3e-3 --weight-decay 0.05 --lr-scheduler cosineannealinglr --lr-min 1e-5 --lr-warmup-method linear --lr-warmup-epochs 32 --label-smoothing 0.1 --mixup-alpha 0.8 --clip-grad-norm 1.0 --interpolation bicubic --auto-augment ta_wide --policy-magnitude 15 --model-ema --val-resize-size 224\ +--val-crop-size 224 --train-crop-size 224 --amp --model-ema-steps 32 --transformer-embedding-decay 0 --sync-bn +``` +Here `$MODEL` is `maxvit_t`. +Note that `--val-resize-size` was not optimized in a post-training step. ### ShuffleNet V2 @@ -281,24 +289,24 @@ For all post training quantized models, the settings are: 2. num_workers: 16 3. batch_size: 32 4. eval_batch_size: 128 -5. backend: 'fbgemm' +5. qbackend: 'fbgemm' ``` -python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' --model='$MODEL' +python train_quantization.py --device='cpu' --post-training-quantize --qbackend='fbgemm' --model='$MODEL' ``` Here `$MODEL` is one of `googlenet`, `inception_v3`, `resnet18`, `resnet50`, `resnext101_32x8d`, `shufflenet_v2_x0_5` and `shufflenet_v2_x1_0`. ### Quantized ShuffleNet V2 -Here are commands that we use to quantized the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. +Here are commands that we use to quantize the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. ``` # For shufflenet_v2_x1_5 -python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' \ +python train_quantization.py --device='cpu' --post-training-quantize --qbackend='fbgemm' \ --model=shufflenet_v2_x1_5 --weights="ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1" \ --train-crop-size 176 --val-resize-size 232 --data-path /datasets01_ontap/imagenet_full_size/061417/ # For shufflenet_v2_x2_0 -python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' \ +python train_quantization.py --device='cpu' --post-training-quantize --qbackend='fbgemm' \ --model=shufflenet_v2_x2_0 --weights="ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1" \ --train-crop-size 176 --val-resize-size 232 --data-path /datasets01_ontap/imagenet_full_size/061417/ ``` @@ -309,7 +317,7 @@ For Mobilenet-v2, the model was trained with quantization aware training, the se 1. num_workers: 16 2. batch_size: 32 3. eval_batch_size: 128 -4. backend: 'qnnpack' +4. qbackend: 'qnnpack' 5. learning-rate: 0.0001 6. num_epochs: 90 7. num_observer_update_epochs:4 @@ -331,7 +339,7 @@ For Mobilenet-v3 Large, the model was trained with quantization aware training, 1. num_workers: 16 2. batch_size: 32 3. eval_batch_size: 128 -4. backend: 'qnnpack' +4. qbackend: 'qnnpack' 5. learning-rate: 0.001 6. num_epochs: 90 7. num_observer_update_epochs:4 @@ -351,7 +359,7 @@ For post training quant, device is set to CPU. 
For training, the device is set t ### Command to evaluate quantized models using the pre-trained weights: ``` -python train_quantization.py --device='cpu' --test-only --backend='' --model='' +python train_quantization.py --device='cpu' --test-only --qbackend='' --model='' ``` For inception_v3 you need to pass the following extra parameters: diff --git a/references/classification/presets.py b/references/classification/presets.py index 6bc38e72953..8653957a576 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -1,9 +1,23 @@ import torch -from torchvision.transforms import autoaugment, transforms from torchvision.transforms.functional import InterpolationMode +def get_module(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + + return torchvision.transforms.v2 + else: + import torchvision.transforms + + return torchvision.transforms + + class ClassificationPresetTrain: + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. We may change that in the + # future though, if we change the output type from the dataset. def __init__( self, *, @@ -13,32 +27,51 @@ def __init__( interpolation=InterpolationMode.BILINEAR, hflip_prob=0.5, auto_augment_policy=None, + ra_magnitude=9, + augmix_severity=3, random_erase_prob=0.0, + backend="pil", + use_v2=False, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + T = get_module(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") + + transforms.append(T.RandomResizedCrop(crop_size, interpolation=interpolation, antialias=True)) if hflip_prob > 0: - trans.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms.append(T.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: if auto_augment_policy == "ra": - trans.append(autoaugment.RandAugment(interpolation=interpolation)) + transforms.append(T.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) elif auto_augment_policy == "ta_wide": - trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation)) + transforms.append(T.TrivialAugmentWide(interpolation=interpolation)) elif auto_augment_policy == "augmix": - trans.append(autoaugment.AugMix(interpolation=interpolation)) + transforms.append(T.AugMix(interpolation=interpolation, severity=augmix_severity)) else: - aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy) - trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation)) - trans.extend( + aa_policy = T.AutoAugmentPolicy(auto_augment_policy) + transforms.append(T.AutoAugment(policy=aa_policy, interpolation=interpolation)) + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms.extend( [ - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), ] ) if random_erase_prob > 0: - trans.append(transforms.RandomErasing(p=random_erase_prob)) + transforms.append(T.RandomErasing(p=random_erase_prob)) - self.transforms = transforms.Compose(trans) + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def 
__call__(self, img): return self.transforms(img) @@ -53,17 +86,34 @@ def __init__( mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), interpolation=InterpolationMode.BILINEAR, + backend="pil", + use_v2=False, ): + T = get_module(use_v2) + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") - self.transforms = transforms.Compose( - [ - transforms.Resize(resize_size, interpolation=interpolation), - transforms.CenterCrop(crop_size), - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), - ] - ) + transforms += [ + T.Resize(resize_size, interpolation=interpolation, antialias=True), + T.CenterCrop(crop_size), + ] + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms += [ + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), + ] + + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def __call__(self, img): return self.transforms(img) diff --git a/references/classification/train.py b/references/classification/train.py index 14360b042ed..d52124fcf33 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -7,12 +7,13 @@ import torch import torch.utils.data import torchvision -import transforms +import torchvision.transforms import utils from sampler import RASampler from torch import nn from torch.utils.data.dataloader import default_collate from torchvision.transforms.functional import InterpolationMode +from transforms import get_mixup_cutmix def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None): @@ -113,7 +114,11 @@ def _get_cache_path(filepath): def load_data(traindir, valdir, args): # Data loading code print("Loading data") - val_resize_size, val_crop_size, train_crop_size = args.val_resize_size, args.val_crop_size, args.train_crop_size + val_resize_size, val_crop_size, train_crop_size = ( + args.val_resize_size, + args.val_crop_size, + args.train_crop_size, + ) interpolation = InterpolationMode(args.interpolation) print("Loading training data") @@ -122,10 +127,15 @@ def load_data(traindir, valdir, args): if args.cache_dataset and os.path.exists(cache_path): # Attention, as the transforms are also cached! print(f"Loading dataset_train from {cache_path}") - dataset, _ = torch.load(cache_path) + # TODO: this could probably be weights_only=True + dataset, _ = torch.load(cache_path, weights_only=False) else: + # We need a default value for the variables below because args may come + # from train_quantization.py which doesn't define them. 
auto_augment_policy = getattr(args, "auto_augment", None) random_erase_prob = getattr(args, "random_erase", 0.0) + ra_magnitude = getattr(args, "ra_magnitude", None) + augmix_severity = getattr(args, "augmix_severity", None) dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( @@ -133,6 +143,10 @@ def load_data(traindir, valdir, args): interpolation=interpolation, auto_augment_policy=auto_augment_policy, random_erase_prob=random_erase_prob, + ra_magnitude=ra_magnitude, + augmix_severity=augmix_severity, + backend=args.backend, + use_v2=args.use_v2, ), ) if args.cache_dataset: @@ -146,14 +160,22 @@ def load_data(traindir, valdir, args): if args.cache_dataset and os.path.exists(cache_path): # Attention, as the transforms are also cached! print(f"Loading dataset_test from {cache_path}") - dataset_test, _ = torch.load(cache_path) + # TODO: this could probably be weights_only=True + dataset_test, _ = torch.load(cache_path, weights_only=False) else: if args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) - preprocessing = weights.transforms() + preprocessing = weights.transforms(antialias=True) + if args.backend == "tensor": + preprocessing = torchvision.transforms.Compose([torchvision.transforms.PILToTensor(), preprocessing]) + else: preprocessing = presets.ClassificationPresetEval( - crop_size=val_crop_size, resize_size=val_resize_size, interpolation=interpolation + crop_size=val_crop_size, + resize_size=val_resize_size, + interpolation=interpolation, + backend=args.backend, + use_v2=args.use_v2, ) dataset_test = torchvision.datasets.ImageFolder( @@ -198,16 +220,18 @@ def main(args): val_dir = os.path.join(args.data_path, "val") dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args) - collate_fn = None num_classes = len(dataset.classes) - mixup_transforms = [] - if args.mixup_alpha > 0.0: - mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) - if args.cutmix_alpha > 0.0: - mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) - if mixup_transforms: - mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) - collate_fn = lambda batch: mixupcutmix(*default_collate(batch)) # noqa: E731 + mixup_cutmix = get_mixup_cutmix( + mixup_alpha=args.mixup_alpha, cutmix_alpha=args.cutmix_alpha, num_classes=num_classes, use_v2=args.use_v2 + ) + if mixup_cutmix is not None: + + def collate_fn(batch): + return mixup_cutmix(*default_collate(batch)) + + else: + collate_fn = default_collate + data_loader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, @@ -303,11 +327,11 @@ def main(args): model_ema = None if args.model_ema: - # Decay adjustment that aims to keep the decay independent from other hyper-parameters originally proposed at: + # Decay adjustment that aims to keep the decay independent of other hyper-parameters originally proposed at: # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123 # # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps) - # We consider constant = Dataset_size for a given dataset/setup and ommit it. Thus: + # We consider constant = Dataset_size for a given dataset/setup and omit it. 
Thus: # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs adjust = args.world_size * args.batch_size * args.model_ema_steps / args.epochs alpha = 1.0 - args.model_ema_decay @@ -315,7 +339,7 @@ def main(args): model_ema = utils.ExponentialMovingAverage(model_without_ddp, device=device, decay=1.0 - alpha) if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) if not args.test_only: optimizer.load_state_dict(checkpoint["optimizer"]) @@ -448,6 +472,8 @@ def get_args_parser(add_help=True): action="store_true", ) parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)") + parser.add_argument("--ra-magnitude", default=9, type=int, help="magnitude of auto augment policy") + parser.add_argument("--augmix-severity", default=3, type=int, help="severity of augmix policy") parser.add_argument("--random-erase", default=0.0, type=float, help="random erasing probability (default: 0.0)") # Mixed precision training parameters @@ -492,7 +518,8 @@ def get_args_parser(add_help=True): "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") - + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/classification/train_quantization.py b/references/classification/train_quantization.py index ed36e13a028..bd324c6eef7 100644 --- a/references/classification/train_quantization.py +++ b/references/classification/train_quantization.py @@ -23,9 +23,9 @@ def main(args): raise RuntimeError("Post training quantization example should not be performed on distributed mode") # Set backend engine to ensure that quantized model runs on the correct kernels - if args.backend not in torch.backends.quantized.supported_engines: - raise RuntimeError("Quantized backend not supported: " + str(args.backend)) - torch.backends.quantized.engine = args.backend + if args.qbackend not in torch.backends.quantized.supported_engines: + raise RuntimeError("Quantized backend not supported: " + str(args.qbackend)) + torch.backends.quantized.engine = args.qbackend device = torch.device(args.device) torch.backends.cudnn.benchmark = True @@ -55,7 +55,7 @@ def main(args): if not (args.test_only or args.post_training_quantize): model.fuse_model(is_qat=True) - model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.backend) + model.qconfig = torch.ao.quantization.get_default_qat_qconfig(args.qbackend) torch.ao.quantization.prepare_qat(model, inplace=True) if args.distributed and args.sync_bn: @@ -74,7 +74,7 @@ def main(args): model_without_ddp = model.module if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) @@ -89,7 +89,7 @@ def main(args): ) model.eval() model.fuse_model(is_qat=False) - model.qconfig = torch.ao.quantization.get_default_qconfig(args.backend) + model.qconfig = torch.ao.quantization.get_default_qconfig(args.qbackend) 
torch.ao.quantization.prepare(model, inplace=True) # Calibrate first print("Calibrating") @@ -161,7 +161,7 @@ def get_args_parser(add_help=True): parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path") parser.add_argument("--model", default="mobilenet_v2", type=str, help="model name") - parser.add_argument("--backend", default="qnnpack", type=str, help="fbgemm or qnnpack") + parser.add_argument("--qbackend", default="qnnpack", type=str, help="Quantized backend: fbgemm or qnnpack") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -257,9 +257,17 @@ def get_args_parser(add_help=True): parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)") parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") + return parser if __name__ == "__main__": args = get_args_parser().parse_args() + if args.backend in ("fbgemm", "qnnpack"): + raise ValueError( + "The --backend parameter has been re-purposed to specify the backend of the transforms (PIL or Tensor) " + "instead of the quantized backend. Please use the --qbackend parameter to specify the quantized backend." + ) main(args) diff --git a/references/classification/transforms.py b/references/classification/transforms.py index 9a8ef7877d6..96236608eec 100644 --- a/references/classification/transforms.py +++ b/references/classification/transforms.py @@ -2,12 +2,35 @@ from typing import Tuple import torch +from presets import get_module from torch import Tensor from torchvision.transforms import functional as F -class RandomMixup(torch.nn.Module): - """Randomly apply Mixup to the provided batch and targets. +def get_mixup_cutmix(*, mixup_alpha, cutmix_alpha, num_classes, use_v2): + transforms_module = get_module(use_v2) + + mixup_cutmix = [] + if mixup_alpha > 0: + mixup_cutmix.append( + transforms_module.MixUp(alpha=mixup_alpha, num_classes=num_classes) + if use_v2 + else RandomMixUp(num_classes=num_classes, p=1.0, alpha=mixup_alpha) + ) + if cutmix_alpha > 0: + mixup_cutmix.append( + transforms_module.CutMix(alpha=cutmix_alpha, num_classes=num_classes) + if use_v2 + else RandomCutMix(num_classes=num_classes, p=1.0, alpha=cutmix_alpha) + ) + if not mixup_cutmix: + return None + + return transforms_module.RandomChoice(mixup_cutmix) + + +class RandomMixUp(torch.nn.Module): + """Randomly apply MixUp to the provided batch and targets. The class implements the data augmentations as described in the paper `"mixup: Beyond Empirical Risk Minimization" `_. @@ -89,8 +112,8 @@ def __repr__(self) -> str: return s -class RandomCutmix(torch.nn.Module): - """Randomly apply Cutmix to the provided batch and targets. +class RandomCutMix(torch.nn.Module): + """Randomly apply CutMix to the provided batch and targets. The class implements the data augmentations as described in the paper `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features" `_. 
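The renamed `RandomMixUp`/`RandomCutMix` classes and the new `get_mixup_cutmix` factory above are consumed through a `DataLoader` collate function in `train.py`. Below is a minimal sketch of that wiring; it assumes `references/classification` is on the import path, and the hyper-parameter values are purely illustrative (the real values come from the `--mixup-alpha` and `--cutmix-alpha` flags).

```
# Sketch only: assumes references/classification is importable so that
# transforms.get_mixup_cutmix (added in this change) can be used directly.
from torch.utils.data.dataloader import default_collate
from transforms import get_mixup_cutmix

# Illustrative hyper-parameters; train.py reads them from the CLI flags.
mixup_cutmix = get_mixup_cutmix(mixup_alpha=0.2, cutmix_alpha=1.0, num_classes=1000, use_v2=True)

def collate_fn(batch):
    # Collate the samples into (images, targets) batch tensors first, then
    # apply MixUp or CutMix to the whole batch, mirroring train.py.
    collated = default_collate(batch)
    return mixup_cutmix(*collated) if mixup_cutmix is not None else collated

# The resulting collate_fn is then passed to torch.utils.data.DataLoader(..., collate_fn=collate_fn).
```

Applying MixUp/CutMix inside the collate function keeps the per-sample transform pipeline untouched and lets the same call site serve both the v1 (`RandomMixUp`/`RandomCutMix`) and v2 (`MixUp`/`CutMix`) implementations.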
diff --git a/references/classification/utils.py b/references/classification/utils.py index c31f3928e86..7d9f0136ae8 100644 --- a/references/classification/utils.py +++ b/references/classification/utils.py @@ -287,8 +287,7 @@ def average_checkpoints(inputs): for fpath in inputs: with open(fpath, "rb") as f: state = torch.load( - f, - map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")), + f, map_location=(lambda s, _: torch.serialization.default_restore_location(s, "cpu")), weights_only=True ) # Copies over the settings from the first checkpoint if new_state is None: @@ -365,12 +364,12 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T checkpoint_path = os.path.abspath(checkpoint_path) output_dir = os.path.dirname(checkpoint_path) - # Deep copy to avoid side-effects on the model object. + # Deep copy to avoid side effects on the model object. model = copy.deepcopy(model) - checkpoint = torch.load(checkpoint_path, map_location="cpu") + checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True) # Load the weights to the model to validate that everything works - # and remove unnecessary weights (such as auxiliaries, etc) + # and remove unnecessary weights (such as auxiliaries, etc.) if checkpoint_key == "model_ema": del checkpoint[checkpoint_key]["n_averaged"] torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.") diff --git a/references/depth/stereo/README.md b/references/depth/stereo/README.md new file mode 100644 index 00000000000..22bcae27ab0 --- /dev/null +++ b/references/depth/stereo/README.md @@ -0,0 +1,180 @@ +# Stereo Matching reference training scripts + +This folder contains reference training scripts for Stereo Matching. +They serve as a log of how to train specific models, so as to provide baseline +training and evaluation scripts to quickly bootstrap research. + + +### CREStereo + +The CREStereo model was trained on a dataset mixture between **CREStereo**, **ETH3D** and the additional split from **Middlebury2014**. +A ratio of **88-6-6** was used in order to train a baseline weight set. We provide a multi-set variant as well. +Both used 8 A100 GPUs and a batch size of 2 (so the effective batch size is 16). The +rest of the hyper-parameters loosely follow the recipe from https://github.com/megvii-research/CREStereo. +The original recipe trains for **300000** updates (or steps) on the dataset mixture. We modify the learning rate +schedule to one that starts decaying the learning rate much sooner. Throughout the experiments we found that this reduces +overfitting during evaluation and that gradient clipping helps stabilize the loss around a premature learning rate change. + +``` +torchrun --nproc_per_node 8 --nnodes 1 train.py \ + --dataset-root $dataset_root \ + --name $name_cre \ + --model crestereo_base \ + --train-datasets crestereo eth3d-train middlebury2014-other \ + --dataset-steps 264000 18000 18000 \ + --batch-size 2 \ + --lr 0.0004 \ + --min-lr 0.00002 \ + --lr-decay-method cosine \ + --warmup-steps 6000 \ + --decay-after-steps 30000 \ + --clip-grad-norm 1.0 \ +``` + +We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets have extremely large images (``2048x2048`` or more) we opt for a very aggressive scale-range ``[0.2 - 0.8]`` so that as much as possible of the original frame composition is captured inside the ``384x512`` crop.
+ +``` +torchrun --nproc_per_node 8 --nnodes 1 train.py \ + --dataset-root $dataset_root \ + --name $name_things \ + --model crestereo_base \ + --train-datasets crestereo eth3d-train middlebury2014-other instereo2k fallingthings carla-highres sintel sceneflow-monkaa sceneflow-driving \ + --dataset-steps 12000 12000 12000 12000 12000 12000 12000 12000 12000 + --batch-size 2 \ + --scale-range 0.2 0.8 \ + --lr 0.0004 \ + --lr-decay-method cosine \ + --decay-after-steps 0 \ + --warmup-steps 0 \ + --min-lr 0.00002 \ + --resume-path $checkpoint_dir/$name_cre.pth +``` + + +### Evaluation + +Evaluating the base weights + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_ETH_MBL_V1 +``` + +This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`. The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations: + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 2.363, 'rmse': 4.352, '1px': 0.611, '3px': 0.828, '5px': 0.891, 'relepe': 0.176, 'fl-all': 64.511} + 5: {'mae': 1.618, 'rmse': 3.71, '1px': 0.761, '3px': 0.879, '5px': 0.918, 'relepe': 0.154, 'fl-all': 77.128} + 10: {'mae': 1.416, 'rmse': 3.53, '1px': 0.777, '3px': 0.896, '5px': 0.933, 'relepe': 0.148, 'fl-all': 78.388} + 20: {'mae': 1.448, 'rmse': 3.583, '1px': 0.771, '3px': 0.893, '5px': 0.931, 'relepe': 0.145, 'fl-all': 77.7} + }, +} +{ + 2: { + 2: {'mae': 1.972, 'rmse': 4.125, '1px': 0.73, '3px': 0.865, '5px': 0.908, 'relepe': 0.169, 'fl-all': 74.396} + 5: {'mae': 1.403, 'rmse': 3.448, '1px': 0.793, '3px': 0.905, '5px': 0.937, 'relepe': 0.151, 'fl-all': 80.186} + 10: {'mae': 1.312, 'rmse': 3.368, '1px': 0.799, '3px': 0.912, '5px': 0.943, 'relepe': 0.148, 'fl-all': 80.379} + 20: {'mae': 1.376, 'rmse': 3.542, '1px': 0.796, '3px': 0.91, '5px': 0.942, 'relepe': 0.149, 'fl-all': 80.054} + }, +} +``` + +You can also evaluate the Finetuned weights: + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_FINETUNE_MULTI_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.85, 'rmse': 3.797, '1px': 0.673, '3px': 0.862, '5px': 0.917, 'relepe': 0.171, 'fl-all': 69.736} + 5: {'mae': 1.111, 'rmse': 3.166, '1px': 0.838, '3px': 0.93, '5px': 0.957, 'relepe': 0.134, 'fl-all': 84.596} + 10: {'mae': 1.02, 'rmse': 3.073, '1px': 0.854, '3px': 0.938, '5px': 0.96, 'relepe': 0.129, 'fl-all': 86.042} + 20: {'mae': 0.993, 'rmse': 3.059, '1px': 0.855, '3px': 0.942, '5px': 0.967, 'relepe': 0.126, 'fl-all': 85.784} + }, +} +{ + 2: { + 2: {'mae': 1.667, 'rmse': 3.867, '1px': 0.78, '3px': 0.891, '5px': 0.922, 'relepe': 0.165, 'fl-all': 78.89} + 5: {'mae': 1.158, 'rmse': 3.278, '1px': 0.843, '3px': 0.926, '5px': 0.955, 'relepe': 0.135, 'fl-all': 84.556} + 10: {'mae': 1.046, 'rmse': 3.13, '1px': 0.85, '3px': 0.934, '5px': 0.96, 'relepe': 0.13, 'fl-all': 85.464} + 20: {'mae': 1.021, 'rmse': 3.102, '1px': 0.85, '3px': 0.935, '5px': 0.963, 'relepe': 0.129, 'fl-all': 85.417} + }, +} +``` + +Evaluating the author provided weights: + +``` +torchrun --nproc_per_node 
1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.MEGVII_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.704, 'rmse': 3.738, '1px': 0.738, '3px': 0.896, '5px': 0.933, 'relepe': 0.157, 'fl-all': 76.464} + 5: {'mae': 0.956, 'rmse': 2.963, '1px': 0.88, '3px': 0.948, '5px': 0.965, 'relepe': 0.124, 'fl-all': 88.186} + 10: {'mae': 0.792, 'rmse': 2.765, '1px': 0.905, '3px': 0.958, '5px': 0.97, 'relepe': 0.114, 'fl-all': 90.429} + 20: {'mae': 0.749, 'rmse': 2.706, '1px': 0.907, '3px': 0.961, '5px': 0.972, 'relepe': 0.113, 'fl-all': 90.807} + }, +} +{ + 2: { + 2: {'mae': 1.702, 'rmse': 3.784, '1px': 0.784, '3px': 0.894, '5px': 0.924, 'relepe': 0.172, 'fl-all': 80.313} + 5: {'mae': 0.932, 'rmse': 2.907, '1px': 0.877, '3px': 0.944, '5px': 0.963, 'relepe': 0.125, 'fl-all': 87.979} + 10: {'mae': 0.773, 'rmse': 2.768, '1px': 0.901, '3px': 0.958, '5px': 0.972, 'relepe': 0.117, 'fl-all': 90.43} + 20: {'mae': 0.854, 'rmse': 2.971, '1px': 0.9, '3px': 0.957, '5px': 0.97, 'relepe': 0.122, 'fl-all': 90.269} + }, +} +``` + +# Concerns when training + +We encourage users to be aware of the **aspect-ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors; as a consequence of naive multi-set fine-tuning, one can achieve `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks. + + Ideally, dataset scaling should be treated at an individual level and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments. + +### Disparity scaling + +##### Sample A + The top row contains a sample from `Sintel` whereas the bottom row contains one from `Middlebury`. + +![Disparity1](assets/disparity-domain-drift.jpg) + +From left to right (`left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`). **Darker is further away, lighter is closer**. In the case of `Sintel`, which is more closely aligned to the original distribution of `CREStereo`, we notice that the model accurately predicts the background scale, whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene, the crop size of `384x512` does not correctly capture the general training distribution. + + + + +##### Sample B + +The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples, the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
+ +![Disparity2](assets/disparity-background-mode-collapse.jpg) + + +For more detail on why this behaviour occurs based on the training distribution proportions, you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493 + + +### Metric overfitting + +##### Learning is critical in the beginning + +We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit the difference between a faster decay schedule and a later decay schedule. + +![Loss1](assets/Loss.jpg) + +In **grey** we set the lr decay to begin after `30000` steps whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although exhibiting stronger variance, we can notice that unfreezing the learning rate earlier whilst employing `gradient-norm` out-performs the default configuration. + +##### Gradient norm saves time + +![Loss2](assets/gradient-norm-removal.jpg) + +In **grey** we keep ``gradient norm`` enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages whilst also showcasing an almost complete collapse around the `60000` steps mark, where we started decaying the lr for **orange**. + +Although both runs achieve an improvement of about ``0.1`` mae after the lr decay starts, the benefits of it are observable much faster when ``gradient norm`` is employed, as the recovery period is no longer accounted for. diff --git a/references/depth/stereo/__init__.py b/references/depth/stereo/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/references/depth/stereo/assets/Loss.jpg b/references/depth/stereo/assets/Loss.jpg new file mode 100644 index 00000000000..b6db8e204af Binary files /dev/null and b/references/depth/stereo/assets/Loss.jpg differ diff --git a/references/depth/stereo/assets/disparity-background-mode-collapse.jpg b/references/depth/stereo/assets/disparity-background-mode-collapse.jpg new file mode 100644 index 00000000000..b6542e8814f Binary files /dev/null and b/references/depth/stereo/assets/disparity-background-mode-collapse.jpg differ diff --git a/references/depth/stereo/assets/disparity-domain-drift.jpg b/references/depth/stereo/assets/disparity-domain-drift.jpg new file mode 100644 index 00000000000..8a98de03675 Binary files /dev/null and b/references/depth/stereo/assets/disparity-domain-drift.jpg differ diff --git a/references/depth/stereo/assets/gradient-norm-removal.jpg b/references/depth/stereo/assets/gradient-norm-removal.jpg new file mode 100644 index 00000000000..2c3c8459d5e Binary files /dev/null and b/references/depth/stereo/assets/gradient-norm-removal.jpg differ diff --git a/references/depth/stereo/cascade_evaluation.py b/references/depth/stereo/cascade_evaluation.py new file mode 100644 index 00000000000..7cb6413f1a5 --- /dev/null +++ b/references/depth/stereo/cascade_evaluation.py @@ -0,0 +1,299 @@ +import os +import warnings + +import torch +import torchvision +import torchvision.prototype.models.depth.stereo +import utils +from torch.nn import functional as F +from train import make_eval_loader + +from utils.metrics import AVAILABLE_METRICS +from visualization import make_prediction_image_side_to_side + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Evaluation", add_help=add_help) + parser.add_argument("--dataset", type=str, default="middlebury2014-train",
help="dataset to use") + parser.add_argument("--dataset-root", type=str, default="", help="root of the dataset") + + parser.add_argument("--checkpoint", type=str, default="", help="path to weights") + parser.add_argument("--weights", type=str, default=None, help="torchvision API weight") + parser.add_argument( + "--model", + type=str, + default="crestereo_base", + help="which model to use if not speciffying a training checkpoint", + ) + parser.add_argument("--img-folder", type=str, default="images") + + parser.add_argument("--batch-size", type=int, default=1, help="batch size") + parser.add_argument("--workers", type=int, default=0, help="number of workers") + + parser.add_argument("--eval-size", type=int, nargs="+", default=[384, 512], help="resize size") + parser.add_argument( + "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" + ) + parser.add_argument( + "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" + ) + parser.add_argument( + "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False + ) + parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") + parser.add_argument( + "--interpolation-strategy", + type=str, + default="bilinear", + help="interpolation strategy", + choices=["bilinear", "bicubic", "mixed"], + ) + + parser.add_argument("--n_iterations", nargs="+", type=int, default=[10], help="number of recurent iterations") + parser.add_argument("--n_cascades", nargs="+", type=int, default=[1], help="number of cascades") + parser.add_argument( + "--metrics", + type=str, + nargs="+", + default=["mae", "rmse", "1px", "3px", "5px", "relepe"], + help="metrics to log", + choices=AVAILABLE_METRICS, + ) + parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") + + parser.add_argument("--world-size", type=int, default=1, help="number of distributed processes") + parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") + parser.add_argument("--device", type=str, default="cuda", help="device to use for training") + + parser.add_argument("--save-images", action="store_true", help="save images of the predictions") + parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) + + return parser + + +def cascade_inference(model, image_left, image_right, iterations, cascades): + # check that image size is divisible by 16 * (2 ** (cascades - 1)) + for image in [image_left, image_right]: + if image.shape[-2] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image height is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" + ) + + if image.shape[-1] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image width is not divisible by {16 * (2 ** (cascades - 1))}. 
Image shape: {image.shape[-2]}" + ) + + left_image_pyramid = [image_left] + right_image_pyramid = [image_right] + for idx in range(0, cascades - 1): + ds_factor = int(2 ** (idx + 1)) + ds_shape = (image_left.shape[-2] // ds_factor, image_left.shape[-1] // ds_factor) + left_image_pyramid += F.interpolate(image_left, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze(0) + right_image_pyramid += F.interpolate(image_right, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze( + 0 + ) + + flow_init = None + for left_image, right_image in zip(reversed(left_image_pyramid), reversed(right_image_pyramid)): + flow_pred = model(left_image, right_image, flow_init, num_iters=iterations) + # flow pred is a list + flow_init = flow_pred[-1] + + return flow_init + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=10, + cascades=1, + batch_size=None, + header=None, + save_images=False, + save_path="", +): + """Helper function to compute various metrics (epe, etc.) for a model on a given dataset. + We process as many samples as possible with ddp. + """ + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + batch_idx = 0 + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_pred = cascade_inference(model, image_left, image_right, iterations, cascades) + disp_pred = disp_pred[:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + if save_images: + if args.distributed: + rank_prefix = args.rank + else: + rank_prefix = 0 + make_prediction_image_side_to_side( + disp_pred, disp_gt, valid_disp_mask, save_path, prefix=f"batch_{rank_prefix}_{batch_idx}" + ) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + batch_idx += 1 + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) / args.world_size + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size {len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size for more accurate results." 
+ ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + logger_metrics = {k: v.global_avg for k, v in logger.meters.items()} + return logger_metrics + + +def evaluate(model, loader, args, writer=None, step=None): + os.makedirs(args.img_folder, exist_ok=True) + checkpoint_name = os.path.basename(args.checkpoint) or args.weights + image_checkpoint_folder = os.path.join(args.img_folder, checkpoint_name) + + metrics = {} + base_image_folder = os.path.join(image_checkpoint_folder, args.dataset) + os.makedirs(base_image_folder, exist_ok=True) + + for n_cascades in args.n_cascades: + for n_iters in args.n_iterations: + + config = f"{n_cascades}c_{n_iters}i" + config_image_folder = os.path.join(base_image_folder, config) + os.makedirs(config_image_folder, exist_ok=True) + + metrics[config] = _evaluate( + model, + args, + loader, + padder_mode=args.padder_type, + header=f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{n_cascades} n_iters:{n_iters}", + batch_size=args.batch_size, + writer=writer, + step=step, + iterations=n_iters, + cascades=n_cascades, + save_path=config_image_folder, + save_images=args.save_images, + ) + + metric_log = [] + metric_log_dict = {} + # print the final results + for config in metrics: + config_tokens = config.split("_") + config_iters = config_tokens[1][:-1] + config_cascades = config_tokens[0][:-1] + + metric_log_dict[config_cascades] = metric_log_dict.get(config_cascades, {}) + metric_log_dict[config_cascades][config_iters] = metrics[config] + + evaluation_str = f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{config_cascades} recurrent_updates:{config_iters}" + metrics_str = f"Metrics: {metrics[config]}" + metric_log.extend([evaluation_str, metrics_str]) + + print(evaluation_str) + print(metrics_str) + + eval_log_name = f"{checkpoint_name.replace('.pth', '')}_eval.log" + print("Saving eval log to: ", eval_log_name) + with open(eval_log_name, "w") as f: + f.write(f"Dataset: {args.dataset} @size: {args.eval_size}:\n") + # write the dict line by line for each key, and each value in the keys + for config_cascades in metric_log_dict: + f.write("{\n") + f.write(f"\t{config_cascades}: {{\n") + for config_iters in metric_log_dict[config_cascades]: + # convert every metric to 4 decimal places + metrics = metric_log_dict[config_cascades][config_iters] + metrics = {k: float(f"{v:.3f}") for k, v in metrics.items()} + f.write(f"\t\t{config_iters}: {metrics}\n") + f.write("\t},\n") + f.write("}\n") + + +def load_checkpoint(args): + utils.setup_ddp(args) + + if not args.weights: + checkpoint = torch.load(args.checkpoint, map_location=torch.device("cpu"), weights_only=True) + if "model" in checkpoint: + experiment_args = checkpoint["args"] + model = torchvision.prototype.models.depth.stereo.__dict__[experiment_args.model](weights=None) + model.load_state_dict(checkpoint["model"]) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=None) + model.load_state_dict(checkpoint) + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = torch.device(args.device) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to 
DDP if need be + if args.distributed: + model = model.to(args.device) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.to(device) + + return model + + +def main(args): + model = load_checkpoint(args) + loader = make_eval_loader(args.dataset, args) + evaluate(model, loader, args) + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/parsing.py b/references/depth/stereo/parsing.py new file mode 100644 index 00000000000..71a3ba9904e --- /dev/null +++ b/references/depth/stereo/parsing.py @@ -0,0 +1,89 @@ +import argparse +from functools import partial + +import torch + +from presets import StereoMatchingEvalPreset, StereoMatchingTrainPreset +from torchvision.datasets import ( + CarlaStereo, + CREStereo, + ETH3DStereo, + FallingThingsStereo, + InStereo2k, + Kitti2012Stereo, + Kitti2015Stereo, + Middlebury2014Stereo, + SceneFlowStereo, + SintelStereo, +) + +VALID_DATASETS = { + "crestereo": partial(CREStereo), + "carla-highres": partial(CarlaStereo), + "instereo2k": partial(InStereo2k), + "sintel": partial(SintelStereo), + "sceneflow-monkaa": partial(SceneFlowStereo, variant="Monkaa", pass_name="both"), + "sceneflow-flyingthings": partial(SceneFlowStereo, variant="FlyingThings3D", pass_name="both"), + "sceneflow-driving": partial(SceneFlowStereo, variant="Driving", pass_name="both"), + "fallingthings": partial(FallingThingsStereo, variant="both"), + "eth3d-train": partial(ETH3DStereo, split="train"), + "eth3d-test": partial(ETH3DStereo, split="test"), + "kitti2015-train": partial(Kitti2015Stereo, split="train"), + "kitti2015-test": partial(Kitti2015Stereo, split="test"), + "kitti2012-train": partial(Kitti2012Stereo, split="train"), + "kitti2012-test": partial(Kitti2012Stereo, split="train"), + "middlebury2014-other": partial( + Middlebury2014Stereo, split="additional", use_ambient_view=True, calibration="both" + ), + "middlebury2014-train": partial(Middlebury2014Stereo, split="train", calibration="perfect"), + "middlebury2014-test": partial(Middlebury2014Stereo, split="test", calibration=None), + "middlebury2014-train-ambient": partial( + Middlebury2014Stereo, split="train", use_ambient_views=True, calibrartion="perfect" + ), +} + + +def make_train_transform(args: argparse.Namespace) -> torch.nn.Module: + return StereoMatchingTrainPreset( + resize_size=args.resize_size, + crop_size=args.crop_size, + rescale_prob=args.rescale_prob, + scaling_type=args.scaling_type, + scale_range=args.scale_range, + scale_interpolation_type=args.interpolation_strategy, + use_grayscale=args.use_grayscale, + mean=args.norm_mean, + std=args.norm_std, + horizontal_flip_prob=args.flip_prob, + gpu_transforms=args.gpu_transforms, + max_disparity=args.max_disparity, + spatial_shift_prob=args.spatial_shift_prob, + spatial_shift_max_angle=args.spatial_shift_max_angle, + spatial_shift_max_displacement=args.spatial_shift_max_displacement, + spatial_shift_interpolation_type=args.interpolation_strategy, + gamma_range=args.gamma_range, + brightness=args.brightness_range, + contrast=args.contrast_range, + saturation=args.saturation_range, + hue=args.hue_range, + asymmetric_jitter_prob=args.asymmetric_jitter_prob, + ) + + +def make_eval_transform(args: argparse.Namespace) -> torch.nn.Module: + if args.eval_size is None: + resize_size = args.crop_size + else: + resize_size = args.eval_size + + return StereoMatchingEvalPreset( + mean=args.norm_mean, + std=args.norm_std, + use_grayscale=args.use_grayscale, + 
resize_size=resize_size, + interpolation_type=args.interpolation_strategy, + ) + + +def make_dataset(dataset_name: str, dataset_root: str, transforms: torch.nn.Module) -> torch.utils.data.Dataset: + return VALID_DATASETS[dataset_name](root=dataset_root, transforms=transforms) diff --git a/references/depth/stereo/presets.py b/references/depth/stereo/presets.py new file mode 100644 index 00000000000..cadd2405178 --- /dev/null +++ b/references/depth/stereo/presets.py @@ -0,0 +1,144 @@ +from typing import Optional, Tuple, Union + +import torch +import transforms as T + + +class StereoMatchingEvalPreset(torch.nn.Module): + def __init__( + self, + mean: float = 0.5, + std: float = 0.5, + resize_size: Optional[Tuple[int, ...]] = None, + max_disparity: Optional[float] = None, + interpolation_type: str = "bilinear", + use_grayscale: bool = False, + ) -> None: + super().__init__() + + transforms = [ + T.ToTensor(), + T.ConvertImageDtype(torch.float32), + ] + + if use_grayscale: + transforms.append(T.ConvertToGrayscale()) + + if resize_size is not None: + transforms.append(T.Resize(resize_size, interpolation_type=interpolation_type)) + + transforms.extend( + [ + T.Normalize(mean=mean, std=std), + T.MakeValidDisparityMask(max_disparity=max_disparity), + T.ValidateModelInput(), + ] + ) + + self.transforms = T.Compose(transforms) + + def forward(self, images, disparities, masks): + return self.transforms(images, disparities, masks) + + +class StereoMatchingTrainPreset(torch.nn.Module): + def __init__( + self, + *, + resize_size: Optional[Tuple[int, ...]], + resize_interpolation_type: str = "bilinear", + # RandomResizeAndCrop params + crop_size: Tuple[int, int], + rescale_prob: float = 1.0, + scaling_type: str = "exponential", + scale_range: Tuple[float, float] = (-0.2, 0.5), + scale_interpolation_type: str = "bilinear", + # convert to grayscale + use_grayscale: bool = False, + # normalization params + mean: float = 0.5, + std: float = 0.5, + # processing device + gpu_transforms: bool = False, + # masking + max_disparity: Optional[int] = 256, + # SpatialShift params + spatial_shift_prob: float = 0.5, + spatial_shift_max_angle: float = 0.5, + spatial_shift_max_displacement: float = 0.5, + spatial_shift_interpolation_type: str = "bilinear", + # AssymetricColorJitter + gamma_range: Tuple[float, float] = (0.8, 1.2), + brightness: Union[int, Tuple[int, int]] = (0.8, 1.2), + contrast: Union[int, Tuple[int, int]] = (0.8, 1.2), + saturation: Union[int, Tuple[int, int]] = 0.0, + hue: Union[int, Tuple[int, int]] = 0.0, + asymmetric_jitter_prob: float = 1.0, + # RandomHorizontalFlip + horizontal_flip_prob: float = 0.5, + # RandomOcclusion + occlusion_prob: float = 0.0, + occlusion_px_range: Tuple[int, int] = (50, 100), + # RandomErase + erase_prob: float = 0.0, + erase_px_range: Tuple[int, int] = (50, 100), + erase_num_repeats: int = 1, + ) -> None: + + if scaling_type not in ["linear", "exponential"]: + raise ValueError(f"Unknown scaling type: {scaling_type}. 
Available types: linear, exponential") + + super().__init__() + transforms = [T.ToTensor()] + + # when fixing size across multiple datasets, we ensure + # that the same size is used for all datasets when cropping + if resize_size is not None: + transforms.append(T.Resize(resize_size, interpolation_type=resize_interpolation_type)) + + if gpu_transforms: + transforms.append(T.ToGPU()) + + # color handling + color_transforms = [ + T.AsymmetricColorJitter( + brightness=brightness, contrast=contrast, saturation=saturation, hue=hue, p=asymmetric_jitter_prob + ), + T.AsymetricGammaAdjust(p=asymmetric_jitter_prob, gamma_range=gamma_range), + ] + + if use_grayscale: + color_transforms.append(T.ConvertToGrayscale()) + + transforms.extend(color_transforms) + + transforms.extend( + [ + T.RandomSpatialShift( + p=spatial_shift_prob, + max_angle=spatial_shift_max_angle, + max_px_shift=spatial_shift_max_displacement, + interpolation_type=spatial_shift_interpolation_type, + ), + T.ConvertImageDtype(torch.float32), + T.RandomRescaleAndCrop( + crop_size=crop_size, + scale_range=scale_range, + rescale_prob=rescale_prob, + scaling_type=scaling_type, + interpolation_type=scale_interpolation_type, + ), + T.RandomHorizontalFlip(horizontal_flip_prob), + # occlusion after flip, otherwise we're occluding the reference image + T.RandomOcclusion(p=occlusion_prob, occlusion_px_range=occlusion_px_range), + T.RandomErase(p=erase_prob, erase_px_range=erase_px_range, max_erase=erase_num_repeats), + T.Normalize(mean=mean, std=std), + T.MakeValidDisparityMask(max_disparity), + T.ValidateModelInput(), + ] + ) + + self.transforms = T.Compose(transforms) + + def forward(self, images, disparties, mask): + return self.transforms(images, disparties, mask) diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py new file mode 100644 index 00000000000..83db313ae80 --- /dev/null +++ b/references/depth/stereo/train.py @@ -0,0 +1,788 @@ +import argparse +import os +import warnings +from pathlib import Path +from typing import List, Union + +import numpy.typing as npt +import torch +import torch.distributed as dist +import torchvision.models.optical_flow +import torchvision.prototype.models.depth.stereo +import utils +import visualization + +from parsing import make_dataset, make_eval_transform, make_train_transform, VALID_DATASETS +from torch import nn +from torchvision.transforms.functional import get_dimensions, InterpolationMode, resize +from utils.metrics import AVAILABLE_METRICS +from utils.norm import freeze_batch_norm + + +def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_channels: int) -> torch.Tensor: + """Helper function to make stereo flow from a given model output""" + if isinstance(flow, list): + return [make_stereo_flow(flow_i, model_out_channels) for flow_i in flow] + + B, C, H, W = flow.shape + # we need to add zero flow if the model outputs 2 channels + if C == 1 and model_out_channels == 2: + zero_flow = torch.zeros_like(flow) + # by convention the flow is X-Y axis, so we need the Y flow last + flow = torch.cat([flow, zero_flow], dim=1) + return flow + + +def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> npt.NDArray: + """Helper function to return a learning rate scheduler for CRE-stereo""" + if args.decay_after_steps < args.warmup_steps: + raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}") + + warmup_steps = args.warmup_steps if args.warmup_steps else 0 + flat_lr_steps = 
args.decay_after_steps - warmup_steps if args.decay_after_steps else 0 + decay_lr_steps = args.total_iterations - flat_lr_steps + + max_lr = args.lr + min_lr = args.min_lr + + schedulers = [] + milestones = [] + + if warmup_steps > 0: + if args.lr_warmup_method == "linear": + warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + elif args.lr_warmup_method == "constant": + warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( + optimizer, factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + else: + raise ValueError(f"Unknown lr warmup method {args.lr_warmup_method}") + schedulers.append(warmup_lr_scheduler) + milestones.append(warmup_steps) + + if flat_lr_steps > 0: + flat_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=max_lr, total_iters=flat_lr_steps) + schedulers.append(flat_lr_scheduler) + milestones.append(flat_lr_steps + warmup_steps) + + if decay_lr_steps > 0: + if args.lr_decay_method == "cosine": + decay_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=decay_lr_steps, eta_min=min_lr + ) + elif args.lr_decay_method == "linear": + decay_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=max_lr, end_factor=min_lr, total_iters=decay_lr_steps + ) + elif args.lr_decay_method == "exponential": + decay_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, gamma=args.lr_decay_gamma, last_epoch=-1 + ) + else: + raise ValueError(f"Unknown lr decay method {args.lr_decay_method}") + schedulers.append(decay_lr_scheduler) + + scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=milestones) + return scheduler + + +def shuffle_dataset(dataset): + """Shuffle the dataset""" + perm = torch.randperm(len(dataset)) + return torch.utils.data.Subset(dataset, perm) + + +def resize_dataset_to_n_steps( + dataset: torch.utils.data.Dataset, dataset_steps: int, samples_per_step: int, args: argparse.Namespace +) -> torch.utils.data.Dataset: + original_size = len(dataset) + if args.steps_is_epochs: + samples_per_step = original_size + target_size = dataset_steps * samples_per_step + + dataset_copies = [] + n_expands, remainder = divmod(target_size, original_size) + for idx in range(n_expands): + dataset_copies.append(dataset) + + if remainder > 0: + dataset_copies.append(torch.utils.data.Subset(dataset, list(range(remainder)))) + + if args.dataset_shuffle: + dataset_copies = [shuffle_dataset(dataset_copy) for dataset_copy in dataset_copies] + + dataset = torch.utils.data.ConcatDataset(dataset_copies) + return dataset + + +def get_train_dataset(dataset_root: str, args: argparse.Namespace) -> torch.utils.data.Dataset: + datasets = [] + for dataset_name in args.train_datasets: + transform = make_train_transform(args) + dataset = make_dataset(dataset_name, dataset_root, transform) + datasets.append(dataset) + + if len(datasets) == 0: + raise ValueError("No datasets specified for training") + + samples_per_step = args.world_size * args.batch_size + + for idx, (dataset, steps_per_dataset) in enumerate(zip(datasets, args.dataset_steps)): + datasets[idx] = resize_dataset_to_n_steps(dataset, steps_per_dataset, samples_per_step, args) + + dataset = torch.utils.data.ConcatDataset(datasets) + if args.dataset_order_shuffle: + dataset = shuffle_dataset(dataset) + + print(f"Training dataset: {len(dataset)} samples") + return dataset + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + 
padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=None, + batch_size=None, + header=None, +): + """Helper function to compute various metrics (epe, etc.) for a model on a given dataset.""" + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=iterations) + disp_pred = disp_predictions[-1][:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size {len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size or GPU number for more accurate results." 
+ ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + +def make_eval_loader(dataset_name: str, args: argparse.Namespace) -> torch.utils.data.DataLoader: + if args.weights: + weights = torchvision.models.get_weight(args.weights) + trans = weights.transforms() + + def preprocessing(image_left, image_right, disp, valid_disp_mask): + C_o, H_o, W_o = get_dimensions(image_left) + image_left, image_right = trans(image_left, image_right) + + C_t, H_t, W_t = get_dimensions(image_left) + scale_factor = W_t / W_o + + if disp is not None and not isinstance(disp, torch.Tensor): + disp = torch.from_numpy(disp) + if W_t != W_o: + disp = resize(disp, (H_t, W_t), mode=InterpolationMode.BILINEAR) * scale_factor + if valid_disp_mask is not None and not isinstance(valid_disp_mask, torch.Tensor): + valid_disp_mask = torch.from_numpy(valid_disp_mask) + if W_t != W_o: + valid_disp_mask = resize(valid_disp_mask, (H_t, W_t), mode=InterpolationMode.NEAREST) + return image_left, image_right, disp, valid_disp_mask + + else: + preprocessing = make_eval_transform(args) + + val_dataset = make_dataset(dataset_name, args.dataset_root, transforms=preprocessing) + if args.distributed: + sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False) + else: + sampler = torch.utils.data.SequentialSampler(val_dataset) + + val_loader = torch.utils.data.DataLoader( + val_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + return val_loader + + +def evaluate(model, loaders, args, writer=None, step=None): + for loader_name, loader in loaders.items(): + _evaluate( + model, + args, + loader, + iterations=args.recurrent_updates, + padder_mode=args.padder_type, + header=f"{loader_name} evaluation", + batch_size=args.batch_size, + writer=writer, + step=step, + ) + + +def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer, scaler, args): + device = torch.device(args.device) + # wrap the loader in a logger + loader = iter(logger.log_every(train_loader)) + # output channels + model_out_channels = model.module.output_channels if args.distributed else model.output_channels + + torch.set_num_threads(args.threads) + + sequence_criterion = utils.SequenceLoss( + gamma=args.gamma, + max_flow=args.max_disparity, + exclude_large_flows=args.flow_loss_exclude_large, + ).to(device) + + if args.consistency_weight: + consistency_criterion = utils.FlowSequenceConsistencyLoss( + args.gamma, + resize_factor=0.25, + rescale_factor=0.25, + rescale_mode="bilinear", + ).to(device) + else: + consistency_criterion = None + + if args.psnr_weight: + psnr_criterion = utils.PSNRLoss().to(device) + else: + psnr_criterion = None + + if args.smoothness_weight: + smoothness_criterion = utils.SmoothnessLoss().to(device) + else: + smoothness_criterion = None + + if args.photometric_weight: + photometric_criterion = utils.FlowPhotoMetricLoss( + ssim_weight=args.photometric_ssim_weight, + max_displacement_ratio=args.photometric_max_displacement_ratio, + ssim_use_padding=False, + ).to(device) + else: + photometric_criterion = None + + for step in range(args.start_step + 1, args.total_iterations + 1): + data_blob = next(loader) + optimizer.zero_grad() + + # unpack the data blob + image_left, image_right, disp_mask, valid_disp_mask = 
(x.to(device) for x in data_blob) + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates) + # different models have different outputs, make sure we get the right ones for this task + disp_predictions = make_stereo_flow(disp_predictions, model_out_channels) + # should the architecture or training loop require it, we have to adjust the disparity mask + # target to possibly look like an optical flow mask + disp_mask = make_stereo_flow(disp_mask, model_out_channels) + # sequence loss on top of the model outputs + + loss = sequence_criterion(disp_predictions, disp_mask, valid_disp_mask) * args.flow_loss_weight + + if args.consistency_weight > 0: + loss_consistency = consistency_criterion(disp_predictions) + loss += loss_consistency * args.consistency_weight + + if args.psnr_weight > 0: + loss_psnr = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_psnr += psnr_criterion( + pred * valid_disp_mask.unsqueeze(1), + disp_mask * valid_disp_mask.unsqueeze(1), + ).mean() # mean the psnr loss over the batch + loss += loss_psnr / len(disp_predictions) * args.psnr_weight + + if args.photometric_weight > 0: + loss_photometric = 0.0 + for pred in disp_predictions: + # predictions might have 1 channel, therefore we need to inpute 0s for the second channel + if model_out_channels == 1: + pred = torch.cat([pred, torch.zeros_like(pred)], dim=1) + + loss_photometric += photometric_criterion( + image_left, image_right, pred, valid_disp_mask + ) # photometric loss already comes out meaned over the batch + loss += loss_photometric / len(disp_predictions) * args.photometric_weight + + if args.smoothness_weight > 0: + loss_smoothness = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_smoothness += smoothness_criterion( + image_left, pred[:, :1, :, :] + ).mean() # mean the smoothness loss over the batch + loss += loss_smoothness / len(disp_predictions) * args.smoothness_weight + + with torch.no_grad(): + metrics, _ = utils.compute_metrics( + disp_predictions[-1][:, :1, :, :], # predictions might have 2 channels + disp_mask[:, :1, :, :], # so does the ground truth + valid_disp_mask, + args.metrics, + ) + + metrics.pop("fl-all", None) + logger.update(loss=loss, **metrics) + + if scaler is not None: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + optimizer.step() + + scheduler.step() + + if not dist.is_initialized() or dist.get_rank() == 0: + if writer is not None and step % args.tensorboard_log_frequency == 0: + # log the loss and metrics to tensorboard + + writer.add_scalar("loss", loss, step) + for name, value in logger.meters.items(): + writer.add_scalar(name, value.avg, step) + # log the images to tensorboard + pred_grid = visualization.make_training_sample_grid( + image_left, image_right, disp_mask, valid_disp_mask, disp_predictions + ) + writer.add_image("predictions", pred_grid, step, dataformats="HWC") + + # second thing we want to see is how relevant the iterative refinement is + pred_sequence_grid = visualization.make_disparity_sequence_grid(disp_predictions, disp_mask) + writer.add_image("sequence", pred_sequence_grid, 
step, dataformats="HWC") + + if step % args.save_frequency == 0: + if not args.distributed or args.rank == 0: + model_without_ddp = ( + model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + ) + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + if step % args.valid_frequency == 0: + evaluate(model, val_loaders, args, writer, step) + model.train() + if args.freeze_batch_norm: + if isinstance(model, nn.parallel.DistributedDataParallel): + freeze_batch_norm(model.module) + else: + freeze_batch_norm(model) + + # one final save at the end + if not args.distributed or args.rank == 0: + model_without_ddp = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + +def main(args): + args.total_iterations = sum(args.dataset_steps) + + # initialize DDP setting + utils.setup_ddp(args) + print(args) + + args.test_only = args.train_datasets is None + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = torch.device(args.device) + + # select model architecture + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to DDP if need be + if args.distributed: + model = model.to(args.gpu) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + else: + model.to(device) + model_without_ddp = model + + os.makedirs(args.checkpoint_dir, exist_ok=True) + + val_loaders = {name: make_eval_loader(name, args) for name in args.test_datasets} + + # EVAL ONLY configurations + if args.test_only: + evaluate(model, val_loaders, args) + return + + # Sanity check for the parameter count + print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") + + # Compose the training dataset + train_dataset = get_train_dataset(args.dataset_root, args) + + # initialize the optimizer + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer == "sgd": + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=0.9) + else: + raise ValueError(f"Unknown optimizer {args.optimizer}. 
Please choose between adam and sgd") + + # initialize the learning rate schedule + scheduler = make_lr_schedule(args, optimizer) + + # load them from checkpoint if needed + args.start_step = 0 + if args.resume_path is not None: + checkpoint = torch.load(args.resume_path, map_location="cpu", weights_only=True) + if "model" in checkpoint: + # this means the user requested to resume from a training checkpoint + model_without_ddp.load_state_dict(checkpoint["model"]) + # this means the user wants to continue training from where it was left off + if args.resume_schedule: + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) + args.start_step = checkpoint["step"] + 1 + # modify starting point of the dat + sample_start_step = args.start_step * args.batch_size * args.world_size + train_dataset = train_dataset[sample_start_step:] + + else: + # this means the user wants to finetune on top of a model state dict + # and that no other changes are required + model_without_ddp.load_state_dict(checkpoint) + + torch.backends.cudnn.benchmark = True + + # enable training mode + model.train() + if args.freeze_batch_norm: + freeze_batch_norm(model_without_ddp) + + # put dataloader on top of the dataset + # make sure to disable shuffling since the dataset is already shuffled + # in order to guarantee quasi randomness whilst retaining a deterministic + # dataset consumption order + if args.distributed: + # the train dataset is preshuffled in order to respect the iteration order + sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=False, drop_last=True) + else: + # the train dataset is already shuffled, so we can use a simple SequentialSampler + sampler = torch.utils.data.SequentialSampler(train_dataset) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + # initialize the logger + if args.tensorboard_summaries: + from torch.utils.tensorboard import SummaryWriter + + tensorboard_path = Path(args.checkpoint_dir) / "tensorboard" + os.makedirs(tensorboard_path, exist_ok=True) + + tensorboard_run = tensorboard_path / f"{args.name}" + writer = SummaryWriter(tensorboard_run) + else: + writer = None + + logger = utils.MetricLogger(delimiter=" ") + + scaler = torch.cuda.amp.GradScaler() if args.mixed_precision else None + # run the training loop + # this will perform optimization, respectively logging and saving checkpoints + # when need be + run( + model=model, + optimizer=optimizer, + scheduler=scheduler, + train_loader=train_loader, + val_loaders=val_loaders, + logger=logger, + writer=writer, + scaler=scaler, + args=args, + ) + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Training", add_help=add_help) + # checkpointing + parser.add_argument("--name", default="crestereo", help="name of the experiment") + parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume") + parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory") + + # dataset + parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory") + parser.add_argument( + "--train-datasets", + type=str, + nargs="+", + default=["crestereo"], + help="dataset(s) to train on", + choices=list(VALID_DATASETS.keys()), + ) + parser.add_argument( + "--dataset-steps", 
type=int, nargs="+", default=[300_000], help="number of steps for each dataset" + ) + parser.add_argument( + "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs" + ) + parser.add_argument( + "--test-datasets", + type=str, + nargs="+", + default=["middlebury2014-train"], + help="dataset(s) to test on", + choices=["middlebury2014-train"], + ) + parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True) + parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True) + parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU") + parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU") + parser.add_argument( + "--threads", + type=int, + default=16, + help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.", + ) + + # model architecture + parser.add_argument( + "--model", + type=str, + default="crestereo_base", + help="model architecture", + choices=["crestereo_base", "raft_stereo"], + ) + parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates") + parser.add_argument("--freeze-batch-norm", action="store_true", help="freeze batch norm parameters") + + # loss parameters + parser.add_argument("--gamma", type=float, default=0.8, help="gamma parameter for the flow sequence loss") + parser.add_argument("--flow-loss-weight", type=float, default=1.0, help="weight for the flow loss") + parser.add_argument( + "--flow-loss-exclude-large", + action="store_true", + help="exclude large flow values from the loss. A large value is defined as a value greater than the ground truth flow norm", + default=False, + ) + parser.add_argument("--consistency-weight", type=float, default=0.0, help="consistency loss weight") + parser.add_argument( + "--consistency-resize-factor", + type=float, + default=0.25, + help="consistency loss resize factor to account for the fact that the flow is computed on a downsampled image", + ) + parser.add_argument("--psnr-weight", type=float, default=0.0, help="psnr loss weight") + parser.add_argument("--smoothness-weight", type=float, default=0.0, help="smoothness loss weight") + parser.add_argument("--photometric-weight", type=float, default=0.0, help="photometric loss weight") + parser.add_argument( + "--photometric-max-displacement-ratio", + type=float, + default=0.15, + help="Only pixels with a displacement smaller than this ratio of the image width will be considered for the photometric loss", + ) + parser.add_argument("--photometric-ssim-weight", type=float, default=0.85, help="photometric ssim loss weight") + + # transforms parameters + parser.add_argument("--gpu-transforms", action="store_true", help="use GPU transforms") + parser.add_argument( + "--eval-size", type=int, nargs="+", default=[384, 512], help="size of the images for evaluation" + ) + parser.add_argument("--resize-size", type=int, nargs=2, default=None, help="resize size") + parser.add_argument("--crop-size", type=int, nargs=2, default=[384, 512], help="crop size") + parser.add_argument("--scale-range", type=float, nargs=2, default=[0.6, 1.0], help="random scale range") + parser.add_argument("--rescale-prob", type=float, default=1.0, help="probability of resizing the image") + parser.add_argument( + "--scaling-type", type=str, default="linear", help="scaling type", choices=["exponential", "linear"] + 
) + parser.add_argument("--flip-prob", type=float, default=0.5, help="probability of flipping the image") + parser.add_argument( + "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" + ) + parser.add_argument( + "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" + ) + parser.add_argument( + "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False + ) + parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") + parser.add_argument( + "--interpolation-strategy", + type=str, + default="bilinear", + help="interpolation strategy", + choices=["bilinear", "bicubic", "mixed"], + ) + parser.add_argument("--spatial-shift-prob", type=float, default=1.0, help="probability of shifting the image") + parser.add_argument( + "--spatial-shift-max-angle", type=float, default=0.1, help="maximum angle for the spatial shift" + ) + parser.add_argument( + "--spatial-shift-max-displacement", type=float, default=2.0, help="maximum displacement for the spatial shift" + ) + parser.add_argument("--gamma-range", type=float, nargs="+", default=[0.8, 1.2], help="range for gamma correction") + parser.add_argument( + "--brightness-range", type=float, nargs="+", default=[0.8, 1.2], help="range for brightness correction" + ) + parser.add_argument( + "--contrast-range", type=float, nargs="+", default=[0.8, 1.2], help="range for contrast correction" + ) + parser.add_argument( + "--saturation-range", type=float, nargs="+", default=0.0, help="range for saturation correction" + ) + parser.add_argument("--hue-range", type=float, nargs="+", default=0.0, help="range for hue correction") + parser.add_argument( + "--asymmetric-jitter-prob", + type=float, + default=1.0, + help="probability of using asymmetric jitter instead of symmetric jitter", + ) + parser.add_argument("--occlusion-prob", type=float, default=0.5, help="probability of occluding the rightimage") + parser.add_argument( + "--occlusion-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of occluded pixels" + ) + parser.add_argument("--erase-prob", type=float, default=0.0, help="probability of erasing in both images") + parser.add_argument( + "--erase-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of erased pixels" + ) + parser.add_argument( + "--erase-num-repeats", type=int, default=1, help="number of times to repeat the erase operation" + ) + + # optimizer parameters + parser.add_argument("--optimizer", type=str, default="adam", help="optimizer", choices=["adam", "sgd"]) + parser.add_argument("--lr", type=float, default=4e-4, help="learning rate") + parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay") + parser.add_argument("--clip-grad-norm", type=float, default=0.0, help="clip grad norm") + + # lr_scheduler parameters + parser.add_argument("--min-lr", type=float, default=2e-5, help="minimum learning rate") + parser.add_argument("--warmup-steps", type=int, default=6_000, help="number of warmup steps") + parser.add_argument( + "--decay-after-steps", type=int, default=180_000, help="number of steps after which to start decay the lr" + ) + parser.add_argument( + "--lr-warmup-method", type=str, default="linear", help="warmup method", choices=["linear", "cosine"] + ) + parser.add_argument("--lr-warmup-factor", type=float, default=0.02, help="warmup factor for the learning rate") + parser.add_argument( + "--lr-decay-method", 
+ type=str, + default="linear", + help="decay method", + choices=["linear", "cosine", "exponential"], + ) + parser.add_argument("--lr-decay-gamma", type=float, default=0.8, help="decay factor for the learning rate") + + # deterministic behaviour + parser.add_argument("--seed", type=int, default=42, help="seed for random number generators") + + # mixed precision training + parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") + + # logging + parser.add_argument("--tensorboard-summaries", action="store_true", help="log to tensorboard") + parser.add_argument("--tensorboard-log-frequency", type=int, default=100, help="log frequency") + parser.add_argument("--save-frequency", type=int, default=1_000, help="save frequency") + parser.add_argument("--valid-frequency", type=int, default=1_000, help="validation frequency") + parser.add_argument( + "--metrics", + type=str, + nargs="+", + default=["mae", "rmse", "1px", "3px", "5px", "relepe"], + help="metrics to log", + choices=AVAILABLE_METRICS, + ) + + # distributed parameters + parser.add_argument("--world-size", type=int, default=8, help="number of distributed processes") + parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") + parser.add_argument("--device", type=str, default="cuda", help="device to use for training") + + # weights API + parser.add_argument("--weights", type=str, default=None, help="weights API url") + parser.add_argument( + "--resume-path", type=str, default=None, help="a path from which to resume or start fine-tuning" + ) + parser.add_argument("--resume-schedule", action="store_true", help="resume optimizer state") + + # padder parameters + parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/transforms.py b/references/depth/stereo/transforms.py new file mode 100644 index 00000000000..9c4a6bab6d3 --- /dev/null +++ b/references/depth/stereo/transforms.py @@ -0,0 +1,650 @@ +import random +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F +from torch import Tensor + +T_FLOW = Union[Tensor, np.ndarray, None] +T_MASK = Union[Tensor, np.ndarray, None] +T_STEREO_TENSOR = Tuple[Tensor, Tensor] +T_COLOR_AUG_PARAM = Union[float, Tuple[float, float]] + + +def rand_float_range(size: Sequence[int], low: float, high: float) -> Tensor: + return (low - high) * torch.rand(size) + high + + +class InterpolationStrategy: + + _valid_modes: List[str] = ["mixed", "bicubic", "bilinear"] + + def __init__(self, mode: str = "mixed") -> None: + if mode not in self._valid_modes: + raise ValueError(f"Invalid interpolation mode: {mode}. 
Valid modes are: {self._valid_modes}") + + if mode == "mixed": + self.strategies = [F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC] + elif mode == "bicubic": + self.strategies = [F.InterpolationMode.BICUBIC] + elif mode == "bilinear": + self.strategies = [F.InterpolationMode.BILINEAR] + + def __call__(self) -> F.InterpolationMode: + return random.choice(self.strategies) + + @classmethod + def is_valid(mode: str) -> bool: + return mode in InterpolationStrategy._valid_modes + + @property + def valid_modes() -> List[str]: + return InterpolationStrategy._valid_modes + + +class ValidateModelInput(torch.nn.Module): + # Pass-through transform that checks the shape and dtypes to make sure the model gets what it expects + def forward(self, images: T_STEREO_TENSOR, disparities: T_FLOW, masks: T_MASK): + if images[0].shape != images[1].shape: + raise ValueError("img1 and img2 should have the same shape.") + h, w = images[0].shape[-2:] + if disparities[0] is not None and disparities[0].shape != (1, h, w): + raise ValueError(f"disparities[0].shape should be (1, {h}, {w}) instead of {disparities[0].shape}") + if masks[0] is not None: + if masks[0].shape != (h, w): + raise ValueError(f"masks[0].shape should be ({h}, {w}) instead of {masks[0].shape}") + if masks[0].dtype != torch.bool: + raise TypeError(f"masks[0] should be of dtype torch.bool instead of {masks[0].dtype}") + + return images, disparities, masks + + +class ConvertToGrayscale(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.rgb_to_grayscale(images[0], num_output_channels=3) + img_right = F.rgb_to_grayscale(images[1], num_output_channels=3) + + return (img_left, img_right), disparities, masks + + +class MakeValidDisparityMask(torch.nn.Module): + def __init__(self, max_disparity: Optional[int] = 256) -> None: + super().__init__() + self.max_disparity = max_disparity + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + valid_masks = tuple( + torch.ones(images[idx].shape[-2:], dtype=torch.bool, device=images[idx].device) if mask is None else mask + for idx, mask in enumerate(masks) + ) + + valid_masks = tuple( + torch.logical_and(mask, disparity > 0).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + if self.max_disparity is not None: + valid_masks = tuple( + torch.logical_and(mask, disparity < self.max_disparity).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + return images, disparities, valid_masks + + +class ToGPU(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + dev_images = tuple(image.cuda() for image in images) + dev_disparities = tuple(map(lambda x: x.cuda() if x is not None else None, disparities)) + dev_masks = tuple(map(lambda x: x.cuda() if x is not None else None, masks)) + return dev_images, dev_disparities, dev_masks + + +class ConvertImageDtype(torch.nn.Module): + def __init__(self, 
dtype: torch.dtype): + super().__init__() + self.dtype = dtype + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.convert_image_dtype(images[0], dtype=self.dtype) + img_right = F.convert_image_dtype(images[1], dtype=self.dtype) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class Normalize(torch.nn.Module): + def __init__(self, mean: List[float], std: List[float]) -> None: + super().__init__() + self.mean = mean + self.std = std + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left = F.normalize(images[0], mean=self.mean, std=self.std) + img_right = F.normalize(images[1], mean=self.mean, std=self.std) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class ToTensor(torch.nn.Module): + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + if images[0] is None: + raise ValueError("img_left is None") + if images[1] is None: + raise ValueError("img_right is None") + + img_left = F.pil_to_tensor(images[0]) + img_right = F.pil_to_tensor(images[1]) + disparity_tensors = () + mask_tensors = () + + for idx in range(2): + disparity_tensors += (torch.from_numpy(disparities[idx]),) if disparities[idx] is not None else (None,) + mask_tensors += (torch.from_numpy(masks[idx]),) if masks[idx] is not None else (None,) + + return (img_left, img_right), disparity_tensors, mask_tensors + + +class AsymmetricColorJitter(T.ColorJitter): + # p determines the probability of doing asymmetric vs symmetric color jittering + def __init__( + self, + brightness: T_COLOR_AUG_PARAM = 0, + contrast: T_COLOR_AUG_PARAM = 0, + saturation: T_COLOR_AUG_PARAM = 0, + hue: T_COLOR_AUG_PARAM = 0, + p: float = 0.2, + ): + super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + img_left = super().forward(images[0]) + img_right = super().forward(images[1]) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = super().forward(batch) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class AsymetricGammaAdjust(torch.nn.Module): + def __init__(self, p: float, gamma_range: Tuple[float, float], gain: float = 1) -> None: + super().__init__() + self.gamma_range = gamma_range + self.gain = gain + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + gamma = rand_float_range((1,), low=self.gamma_range[0], high=self.gamma_range[1]).item() + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + 
img_left = F.adjust_gamma(images[0], gamma, gain=self.gain) + img_right = F.adjust_gamma(images[1], gamma, gain=self.gain) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = F.adjust_gamma(batch, gamma, gain=self.gain) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class RandomErase(torch.nn.Module): + # Produces multiple symmetric random erasures + # these can be viewed as occlusions present in both camera views. + # Similarly to Optical Flow occlusion prediction tasks, we mask these pixels in the disparity map + def __init__( + self, + p: float = 0.5, + erase_px_range: Tuple[int, int] = (50, 100), + value: Union[Tensor, float] = 0, + inplace: bool = False, + max_erase: int = 2, + ): + super().__init__() + self.min_px_erase = erase_px_range[0] + self.max_px_erase = erase_px_range[1] + if self.max_px_erase < 0: + raise ValueError("erase_px_range[1] should be equal or greater than 0") + if self.min_px_erase < 0: + raise ValueError("erase_px_range[0] should be equal or greater than 0") + if self.min_px_erase > self.max_px_erase: + raise ValueError("erase_prx_range[0] should be equal or lower than erase_px_range[1]") + + self.p = p + self.value = value + self.inplace = inplace + self.max_erase = max_erase + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + return images, disparities, masks + + image_left, image_right = images + mask_left, mask_right = masks + for _ in range(torch.randint(self.max_erase, size=(1,)).item()): + y, x, h, w, v = self._get_params(image_left) + image_right = F.erase(image_right, y, x, h, w, v, self.inplace) + image_left = F.erase(image_left, y, x, h, w, v, self.inplace) + # similarly to optical flow occlusion prediction, we consider + # any erasure pixels that are in both images to be occluded therefore + # we mark them as invalid + if mask_left is not None: + mask_left = F.erase(mask_left, y, x, h, w, False, self.inplace) + if mask_right is not None: + mask_right = F.erase(mask_right, y, x, h, w, False, self.inplace) + + return (image_left, image_right), disparities, (mask_left, mask_right) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_erase, self.max_px_erase), + random.randint(self.min_px_erase, self.max_px_erase), + ) + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + + return crop_y, crop_x, crop_h, crop_w, self.value + + +class RandomOcclusion(torch.nn.Module): + # This adds an occlusion in the right image + # the occluded patch works as a patch erase where the erase value is the mean + # of the pixels from the selected zone + def __init__(self, p: float = 0.5, occlusion_px_range: Tuple[int, int] = (50, 100), inplace: bool = False): + super().__init__() + + self.min_px_occlusion = occlusion_px_range[0] + self.max_px_occlusion = occlusion_px_range[1] + + if self.max_px_occlusion < 0: + raise ValueError("occlusion_px_range[1] should be greater or equal than 0") + if self.min_px_occlusion < 0: + raise ValueError("occlusion_px_range[0] should be greater or equal than 0") + if self.min_px_occlusion > self.max_px_occlusion: + raise ValueError("occlusion_px_range[0] should be lower than occlusion_px_range[1]") + + self.p = p + self.inplace = inplace 
+ + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + left_image, right_image = images + + if torch.rand(1) < self.p: + return images, disparities, masks + + y, x, h, w, v = self._get_params(right_image) + right_image = F.erase(right_image, y, x, h, w, v, self.inplace) + + return ((left_image, right_image), disparities, masks) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_occlusion, self.max_px_occlusion), + random.randint(self.min_px_occlusion, self.max_px_occlusion), + ) + + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + occlusion_value = img[..., crop_y : crop_y + crop_h, crop_x : crop_x + crop_w].mean(dim=(-2, -1), keepdim=True) + + return (crop_y, crop_x, crop_h, crop_w, occlusion_value) + + +class RandomSpatialShift(torch.nn.Module): + # This transform applies a vertical shift and a slight angle rotation and the same time + def __init__( + self, p: float = 0.5, max_angle: float = 0.1, max_px_shift: int = 2, interpolation_type: str = "bilinear" + ) -> None: + super().__init__() + self.p = p + self.max_angle = max_angle + self.max_px_shift = max_px_shift + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + # the transform is applied only on the right image + # in order to mimic slight calibration issues + img_left, img_right = images + + INTERP_MODE = self._interpolation_mode_strategy() + + if torch.rand(1) < self.p: + # [0, 1] -> [-a, a] + shift = rand_float_range((1,), low=-self.max_px_shift, high=self.max_px_shift).item() + angle = rand_float_range((1,), low=-self.max_angle, high=self.max_angle).item() + # sample center point for the rotation matrix + y = torch.randint(size=(1,), low=0, high=img_right.shape[-2]).item() + x = torch.randint(size=(1,), low=0, high=img_right.shape[-1]).item() + # apply affine transformations + img_right = F.affine( + img_right, + angle=angle, + translate=[0, shift], # translation only on the y-axis + center=[x, y], + scale=1.0, + shear=0.0, + interpolation=INTERP_MODE, + ) + + return ((img_left, img_right), disparities, masks) + + +class RandomHorizontalFlip(torch.nn.Module): + def __init__(self, p: float = 0.5) -> None: + super().__init__() + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + + if dsp_right is not None and torch.rand(1) < self.p: + img_left, img_right = F.hflip(img_left), F.hflip(img_right) + dsp_left, dsp_right = F.hflip(dsp_left), F.hflip(dsp_right) + if mask_left is not None and mask_right is not None: + mask_left, mask_right = F.hflip(mask_left), F.hflip(mask_right) + return ((img_right, img_left), (dsp_right, dsp_left), (mask_right, mask_left)) + + return images, disparities, masks + + +class Resize(torch.nn.Module): + def __init__(self, resize_size: Tuple[int, ...], interpolation_type: str = "bilinear") -> None: + super().__init__() + self.resize_size = 
list(resize_size) # doing this to keep mypy happy + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + resized_images = () + resized_disparities = () + resized_masks = () + + INTERP_MODE = self._interpolation_mode_strategy() + + for img in images: + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the stereo models with antialias=True? + resized_images += (F.resize(img, self.resize_size, interpolation=INTERP_MODE, antialias=False),) + + for dsp in disparities: + if dsp is not None: + # rescale disparity to match the new image size + scale_x = self.resize_size[1] / dsp.shape[-1] + resized_disparities += (F.resize(dsp, self.resize_size, interpolation=INTERP_MODE) * scale_x,) + else: + resized_disparities += (None,) + + for mask in masks: + if mask is not None: + resized_masks += ( + # we squeeze and unsqueeze because the API requires > 3D tensors + F.resize( + mask.unsqueeze(0), + self.resize_size, + interpolation=F.InterpolationMode.NEAREST, + ).squeeze(0), + ) + else: + resized_masks += (None,) + + return resized_images, resized_disparities, resized_masks + + +class RandomRescaleAndCrop(torch.nn.Module): + # This transform will resize the input with a given proba, and then crop it. + # These are the reversed operations of the built-in RandomResizedCrop, + # although the order of the operations doesn't matter too much: resizing a + # crop would give the same result as cropping a resized image, up to + # interpolation artifact at the borders of the output. + # + # The reason we don't rely on RandomResizedCrop is because of a significant + # difference in the parametrization of both transforms, in particular, + # because of the way the random parameters are sampled in both transforms, + # which leads to fairly different results (and different epe). For more details see + # https://github.com/pytorch/vision/pull/5026/files#r762932579 + def __init__( + self, + crop_size: Tuple[int, int], + scale_range: Tuple[float, float] = (-0.2, 0.5), + rescale_prob: float = 0.8, + scaling_type: str = "exponential", + interpolation_type: str = "bilinear", + ) -> None: + super().__init__() + self.crop_size = crop_size + self.min_scale = scale_range[0] + self.max_scale = scale_range[1] + self.rescale_prob = rescale_prob + self.scaling_type = scaling_type + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + if self.scaling_type == "linear" and self.min_scale < 0: + raise ValueError("min_scale must be >= 0 for linear scaling") + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + INTERP_MODE = self._interpolation_mode_strategy() + + # randomly sample scale + h, w = img_left.shape[-2:] + # Note: in original code, they use + 1 instead of + 8 for sparse datasets (e.g. 
Kitti) + # It shouldn't matter much + min_scale = max((self.crop_size[0] + 8) / h, (self.crop_size[1] + 8) / w) + + # exponential scaling will draw a random scale in (min_scale, max_scale) and then raise + # 2 to the power of that random value. This final scale distribution will have a different + # mean and variance than a uniform distribution. Note that a scale of 1 will result in + # a rescaling of 2X the original size, whereas a scale of -1 will result in a rescaling + # of 0.5X the original size. + if self.scaling_type == "exponential": + scale = 2 ** torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + # linear scaling will draw a random scale in (min_scale, max_scale) + elif self.scaling_type == "linear": + scale = torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + + scale = max(scale, min_scale) + + new_h, new_w = round(h * scale), round(w * scale) + + if torch.rand(1).item() < self.rescale_prob: + # rescale the images + img_left = F.resize(img_left, size=(new_h, new_w), interpolation=INTERP_MODE) + img_right = F.resize(img_right, size=(new_h, new_w), interpolation=INTERP_MODE) + + resized_masks, resized_disparities = (), () + + for disparity, mask in zip(disparities, masks): + if disparity is not None: + if mask is None: + resized_disparity = F.resize(disparity, size=(new_h, new_w), interpolation=INTERP_MODE) + # rescale the disparity + resized_disparity = ( + resized_disparity * torch.tensor([scale], device=resized_disparity.device)[:, None, None] + ) + resized_mask = None + else: + resized_disparity, resized_mask = _resize_sparse_flow( + disparity, mask, scale_x=scale, scale_y=scale + ) + resized_masks += (resized_mask,) + resized_disparities += (resized_disparity,) + + else: + resized_disparities = disparities + resized_masks = masks + + disparities = resized_disparities + masks = resized_masks + + # Note: For sparse datasets (Kitti), the original code uses a "margin" + # See e.g. 
https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 + # We don't, not sure if it matters much + y0 = torch.randint(0, img_left.shape[1] - self.crop_size[0], size=(1,)).item() + x0 = torch.randint(0, img_right.shape[2] - self.crop_size[1], size=(1,)).item() + + img_left = F.crop(img_left, y0, x0, self.crop_size[0], self.crop_size[1]) + img_right = F.crop(img_right, y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_left is not None: + dsp_left = F.crop(disparities[0], y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_right is not None: + dsp_right = F.crop(disparities[1], y0, x0, self.crop_size[0], self.crop_size[1]) + + cropped_masks = () + for mask in masks: + if mask is not None: + mask = F.crop(mask, y0, x0, self.crop_size[0], self.crop_size[1]) + cropped_masks += (mask,) + + return ((img_left, img_right), (dsp_left, dsp_right), cropped_masks) + + +def _resize_sparse_flow( + flow: Tensor, valid_flow_mask: Tensor, scale_x: float = 1.0, scale_y: float = 0.0 +) -> Tuple[Tensor, Tensor]: + # This resizes both the flow and the valid_flow_mask mask (which is assumed to be reasonably sparse) + # There are as-many non-zero values in the original flow as in the resized flow (up to OOB) + # So for example if scale_x = scale_y = 2, the sparsity of the output flow is multiplied by 4 + + h, w = flow.shape[-2:] + + h_new = int(round(h * scale_y)) + w_new = int(round(w * scale_x)) + flow_new = torch.zeros(size=[1, h_new, w_new], dtype=flow.dtype) + valid_new = torch.zeros(size=[h_new, w_new], dtype=valid_flow_mask.dtype) + + jj, ii = torch.meshgrid(torch.arange(w), torch.arange(h), indexing="xy") + + ii_valid, jj_valid = ii[valid_flow_mask], jj[valid_flow_mask] + + ii_valid_new = torch.round(ii_valid.to(float) * scale_y).to(torch.long) + jj_valid_new = torch.round(jj_valid.to(float) * scale_x).to(torch.long) + + within_bounds_mask = (0 <= ii_valid_new) & (ii_valid_new < h_new) & (0 <= jj_valid_new) & (jj_valid_new < w_new) + + ii_valid = ii_valid[within_bounds_mask] + jj_valid = jj_valid[within_bounds_mask] + ii_valid_new = ii_valid_new[within_bounds_mask] + jj_valid_new = jj_valid_new[within_bounds_mask] + + valid_flow_new = flow[:, ii_valid, jj_valid] + valid_flow_new *= scale_x + + flow_new[:, ii_valid_new, jj_valid_new] = valid_flow_new + valid_new[ii_valid_new, jj_valid_new] = valid_flow_mask[ii_valid, jj_valid] + + return flow_new, valid_new.bool() + + +class Compose(torch.nn.Module): + def __init__(self, transforms: List[Callable]): + super().__init__() + self.transforms = transforms + + @torch.inference_mode() + def forward(self, images, disparities, masks): + for t in self.transforms: + images, disparities, masks = t(images, disparities, masks) + return images, disparities, masks diff --git a/references/depth/stereo/utils/__init__.py b/references/depth/stereo/utils/__init__.py new file mode 100644 index 00000000000..4dacbe61ba0 --- /dev/null +++ b/references/depth/stereo/utils/__init__.py @@ -0,0 +1,6 @@ +from .losses import * +from .metrics import * +from .distributed import * +from .logger import * +from .padder import * +from .norm import * diff --git a/references/depth/stereo/utils/distributed.py b/references/depth/stereo/utils/distributed.py new file mode 100644 index 00000000000..228aa2a0f9a --- /dev/null +++ b/references/depth/stereo/utils/distributed.py @@ -0,0 +1,60 @@ +import os + +import torch +import torch.distributed as dist + + +def _redefine_print(is_main): + """disables printing when not in main process""" + import builtins as 
__builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_main or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def setup_ddp(args): + # Set the local_rank, rank, and world_size values as args fields + # This is done differently depending on how we're running the script. We + # currently support either torchrun or the custom run_with_submitit.py + # If you're confused (like I was), this might help a bit + # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 + + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ["WORLD_SIZE"]) + args.gpu = int(os.environ["LOCAL_RANK"]) + elif "SLURM_PROCID" in os.environ: + args.rank = int(os.environ["SLURM_PROCID"]) + args.gpu = args.rank % torch.cuda.device_count() + elif hasattr(args, "rank"): + pass + else: + print("Not using distributed mode") + args.distributed = False + args.world_size = 1 + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + dist.init_process_group( + backend="nccl", + rank=args.rank, + world_size=args.world_size, + init_method=args.dist_url, + ) + torch.distributed.barrier() + _redefine_print(is_main=(args.rank == 0)) + + +def reduce_across_processes(val): + t = torch.tensor(val, device="cuda") + dist.barrier() + dist.all_reduce(t) + return t diff --git a/references/depth/stereo/utils/logger.py b/references/depth/stereo/utils/logger.py new file mode 100644 index 00000000000..803e9aebd7b --- /dev/null +++ b/references/depth/stereo/utils/logger.py @@ -0,0 +1,153 @@ +import datetime +import time +from collections import defaultdict, deque + +import torch + +from .distributed import reduce_across_processes + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt="{median:.4f} ({global_avg:.4f})"): + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + t = reduce_across_processes([self.count, self.total]) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value + ) + + +class MetricLogger: + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + if not isinstance(v, (float, int)): + raise TypeError( + f"This method expects the value of the input arguments to be of type float or int, instead got {type(v)}" + ) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'") + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append(f"{name}: {str(meter)}") + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, **kwargs): + self.meters[name] = SmoothedValue(**kwargs) + + def log_every(self, iterable, print_freq=5, header=None): + i = 0 + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + space_fmt = ":" + str(len(str(len(iterable)))) + "d" + if torch.cuda.is_available(): + log_msg = self.delimiter.join( + [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + "max mem: {memory:.0f}", + ] + ) + else: + log_msg = self.delimiter.join( + [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] + ) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if print_freq is not None and i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print( + log_msg.format( + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + print( + log_msg.format( + i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) + ) + ) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print(f"{header} Total time: {total_time_str}") diff --git a/references/depth/stereo/utils/losses.py b/references/depth/stereo/utils/losses.py new file mode 100644 index 00000000000..1c21353a056 --- /dev/null +++ b/references/depth/stereo/utils/losses.py @@ -0,0 +1,503 @@ +from typing import List, Optional + +import torch +from torch import nn, Tensor +from torch.nn import functional 
as F +from torchvision.prototype.models.depth.stereo.raft_stereo import grid_sample, make_coords_grid + + +def make_gaussian_kernel(kernel_size: int, sigma: float) -> torch.Tensor: + """Function to create a 2D Gaussian kernel.""" + + x = torch.arange(kernel_size, dtype=torch.float32) + y = torch.arange(kernel_size, dtype=torch.float32) + x = x - (kernel_size - 1) / 2 + y = y - (kernel_size - 1) / 2 + x, y = torch.meshgrid(x, y, indexing="ij") + grid = (x**2 + y**2) / (2 * sigma**2) + kernel = torch.exp(-grid) + kernel = kernel / kernel.sum() + return kernel + + +def _sequence_loss_fn( + flow_preds: List[Tensor], + flow_gt: Tensor, + valid_flow_mask: Optional[Tensor], + gamma: Tensor, + max_flow: int = 256, + exclude_large: bool = False, + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + torch._assert( + gamma < 1, + "sequence_loss: `gamma` must be lower than 1, but got {}".format(gamma), + ) + + if exclude_large: + # exclude invalid pixels and extremely large diplacements + flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) + else: + valid_flow_mask = flow_norm < max_flow + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + + abs_diff = (flow_preds - flow_gt).abs() + if valid_flow_mask is not None: + abs_diff = abs_diff * valid_flow_mask.unsqueeze(0) + + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + num_predictions = flow_preds.shape[0] + + # allocating on CPU and moving to device during run-time can force + # an unwanted GPU synchronization that produces a large overhead + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class SequenceLoss(nn.Module): + def __init__(self, gamma: float = 0.8, max_flow: int = 256, exclude_large_flows: bool = False) -> None: + """ + Args: + gamma: value for the exponential weighting of the loss across frames + max_flow: maximum flow value to exclude + exclude_large_flows: whether to exclude large flows + """ + + super().__init__() + self.max_flow = max_flow + self.excluding_large = exclude_large_flows + self.register_buffer("gamma", torch.tensor([gamma])) + # cache the scale factor for the loss + self._weights = None + + def forward(self, flow_preds: List[Tensor], flow_gt: Tensor, valid_flow_mask: Optional[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of flow predictions of shape (batch_size, C, H, W) + flow_gt: ground truth flow of shape (batch_size, C, H, W) + valid_flow_mask: mask of valid flow pixels of shape (batch_size, H, W) + """ + loss, weights = _sequence_loss_fn( + flow_preds, flow_gt, valid_flow_mask, self.gamma, self.max_flow, self.excluding_large, self._weights + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _ssim_loss_fn( + source: Tensor, + reference: Tensor, + kernel: Tensor, + eps: float = 1e-8, + c1: float = 0.01**2, + c2: float = 0.03**2, + use_padding: bool = False, +) -> Tensor: + # ref: Algorithm section: https://en.wikipedia.org/wiki/Structural_similarity + # ref: Alternative implementation: 
https://kornia.readthedocs.io/en/latest/_modules/kornia/metrics/ssim.html#ssim + + torch._assert( + source.ndim == reference.ndim == 4, + "SSIM: `source` and `reference` must be 4-dimensional tensors", + ) + + torch._assert( + source.shape == reference.shape, + "SSIM: `source` and `reference` must have the same shape, but got {} and {}".format( + source.shape, reference.shape + ), + ) + + B, C, H, W = source.shape + kernel = kernel.unsqueeze(0).unsqueeze(0).repeat(C, 1, 1, 1) + if use_padding: + pad_size = kernel.shape[2] // 2 + source = F.pad(source, (pad_size, pad_size, pad_size, pad_size), "reflect") + reference = F.pad(reference, (pad_size, pad_size, pad_size, pad_size), "reflect") + + mu1 = F.conv2d(source, kernel, groups=C) + mu2 = F.conv2d(reference, kernel, groups=C) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + + mu1_mu2 = mu1 * mu2 + mu_img1_sq = F.conv2d(source.pow(2), kernel, groups=C) + mu_img2_sq = F.conv2d(reference.pow(2), kernel, groups=C) + mu_img1_mu2 = F.conv2d(source * reference, kernel, groups=C) + + sigma1_sq = mu_img1_sq - mu1_sq + sigma2_sq = mu_img2_sq - mu2_sq + sigma12 = mu_img1_mu2 - mu1_mu2 + + numerator = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + denominator = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim = numerator / (denominator + eps) + + # doing 1 - ssim because we want to maximize the ssim + return 1 - ssim.mean(dim=(1, 2, 3)) + + +class SSIM(nn.Module): + def __init__( + self, + kernel_size: int = 11, + max_val: float = 1.0, + sigma: float = 1.5, + eps: float = 1e-12, + use_padding: bool = True, + ) -> None: + """SSIM loss function. + + Args: + kernel_size: size of the Gaussian kernel + max_val: constant scaling factor + sigma: sigma of the Gaussian kernel + eps: constant for division by zero + use_padding: whether to pad the input tensor such that we have a score for each pixel + """ + super().__init__() + + self.kernel_size = kernel_size + self.max_val = max_val + self.sigma = sigma + + gaussian_kernel = make_gaussian_kernel(kernel_size, sigma) + self.register_buffer("gaussian_kernel", gaussian_kernel) + + self.c1 = (0.01 * self.max_val) ** 2 + self.c2 = (0.03 * self.max_val) ** 2 + + self.use_padding = use_padding + self.eps = eps + + def forward(self, source: torch.Tensor, reference: torch.Tensor) -> torch.Tensor: + """ + Args: + source: source image of shape (batch_size, C, H, W) + reference: reference image of shape (batch_size, C, H, W) + + Returns: + SSIM loss of shape (batch_size,) + """ + return _ssim_loss_fn( + source, + reference, + kernel=self.gaussian_kernel, + c1=self.c1, + c2=self.c2, + use_padding=self.use_padding, + eps=self.eps, + ) + + +def _smoothness_loss_fn(img_gx: Tensor, img_gy: Tensor, val_gx: Tensor, val_gy: Tensor): + # ref: https://github.com/nianticlabs/monodepth2/blob/b676244e5a1ca55564eb5d16ab521a48f823af31/layers.py#L202 + + torch._assert( + img_gx.ndim >= 3, + "smoothness_loss: `img_gx` must be at least 3-dimensional tensor of shape (..., C, H, W)", + ) + + torch._assert( + img_gx.ndim == val_gx.ndim, + "smoothness_loss: `img_gx` and `depth_gx` must have the same dimensionality, but got {} and {}".format( + img_gx.ndim, val_gx.ndim + ), + ) + + for idx in range(img_gx.ndim): + torch._assert( + (img_gx.shape[idx] == val_gx.shape[idx] or (img_gx.shape[idx] == 1 or val_gx.shape[idx] == 1)), + "smoothness_loss: `img_gx` and `depth_gx` must have either the same shape or broadcastable shape, but got {} and {}".format( + img_gx.shape, val_gx.shape + ), + ) + + # -3 is channel dimension + weights_x = 
torch.exp(-torch.mean(torch.abs(val_gx), axis=-3, keepdim=True)) + weights_y = torch.exp(-torch.mean(torch.abs(val_gy), axis=-3, keepdim=True)) + + smoothness_x = img_gx * weights_x + smoothness_y = img_gy * weights_y + + smoothness = (torch.abs(smoothness_x) + torch.abs(smoothness_y)).mean(axis=(-3, -2, -1)) + return smoothness + + +class SmoothnessLoss(nn.Module): + def __init__(self) -> None: + super().__init__() + + def _x_gradient(self, img: Tensor) -> Tensor: + if img.ndim > 4: + original_shape = img.shape + is_reshaped = True + img = img.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(img, (0, 1, 0, 0), mode="replicate") + grad = padded[..., :, :-1] - padded[..., :, 1:] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def _y_gradient(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim > 4: + original_shape = x.shape + is_reshaped = True + x = x.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(x, (0, 0, 0, 1), mode="replicate") + grad = padded[..., :-1, :] - padded[..., 1:, :] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def forward(self, images: Tensor, vals: Tensor) -> Tensor: + """ + Args: + images: tensor of shape (D1, D2, ..., DN, C, H, W) + vals: tensor of shape (D1, D2, ..., DN, 1, H, W) + + Returns: + smoothness loss of shape (D1, D2, ..., DN) + """ + img_gx = self._x_gradient(images) + img_gy = self._y_gradient(images) + + val_gx = self._x_gradient(vals) + val_gy = self._y_gradient(vals) + + return _smoothness_loss_fn(img_gx, img_gy, val_gx, val_gy) + + +def _flow_sequence_consistency_loss_fn( + flow_preds: List[Tensor], + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + + # Simplified version of ref: https://arxiv.org/pdf/2006.11242.pdf + # In the original paper, an additional refinement network is used to refine a flow prediction. + # Each step performed by the recurrent module in Raft or CREStereo is a refinement step using a delta_flow update. + # which should be consistent with the previous step. In this implementation, we simplify the overall loss + # term and ignore left-right consistency loss or photometric loss which can be treated separately. 
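+ # In other words, the remaining term below is an exponentially weighted sum of the squared differences between consecutive flow predictions in the refinement sequence, optionally computed at a reduced resolution to smooth out bilinear upsampling artifacts; with gamma < 1, the differences between the latest refinement steps receive the largest weights.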
+ + torch._assert( + rescale_factor <= 1.0, + "sequence_consistency_loss: `rescale_factor` must be less than or equal to 1, but got {}".format( + rescale_factor + ), + ) + + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + N, B, C, H, W = flow_preds.shape + + # rescale flow predictions to account for bilinear upsampling artifacts + if rescale_factor: + flow_preds = ( + F.interpolate( + flow_preds.view(N * B, C, H, W), scale_factor=resize_factor, mode=rescale_mode, align_corners=True + ) + ) * rescale_factor + flow_preds = torch.stack(torch.chunk(flow_preds, N, dim=0), dim=0) + + # force the next prediction to be similar to the previous prediction + abs_diff = (flow_preds[1:] - flow_preds[:-1]).square() + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + + num_predictions = flow_preds.shape[0] - 1 # because we are comparing differences + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class FlowSequenceConsistencyLoss(nn.Module): + def __init__( + self, + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + ) -> None: + super().__init__() + self.gamma = gamma + self.resize_factor = resize_factor + self.rescale_factor = rescale_factor + self.rescale_mode = rescale_mode + self._weights = None + + def forward(self, flow_preds: List[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of tensors of shape (batch_size, C, H, W) + + Returns: + sequence consistency loss of shape (batch_size,) + """ + loss, weights = _flow_sequence_consistency_loss_fn( + flow_preds, + gamma=self.gamma, + resize_factor=self.resize_factor, + rescale_factor=self.rescale_factor, + rescale_mode=self.rescale_mode, + weights=self._weights, + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _psnr_loss_fn(source: torch.Tensor, target: torch.Tensor, max_val: float) -> torch.Tensor: + torch._assert( + source.shape == target.shape, + "psnr_loss: source and target must have the same shape, but got {} and {}".format(source.shape, target.shape), + ) + + # ref https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + return 10 * torch.log10(max_val**2 / ((source - target).pow(2).mean(axis=(-3, -2, -1)))) + + +class PSNRLoss(nn.Module): + def __init__(self, max_val: float = 256) -> None: + """ + Args: + max_val: maximum value of the input tensor. This refers to the maximum domain value of the input tensor. 
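+ (e.g. 255.0 for raw 8-bit images, or 1.0 for inputs normalized to [0, 1])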
+ + """ + super().__init__() + self.max_val = max_val + + def forward(self, source: Tensor, target: Tensor) -> Tensor: + """ + Args: + source: tensor of shape (D1, D2, ..., DN, C, H, W) + target: tensor of shape (D1, D2, ..., DN, C, H, W) + + Returns: + psnr loss of shape (D1, D2, ..., DN) + """ + + # multiply by -1 as we want to maximize the psnr + return -1 * _psnr_loss_fn(source, target, self.max_val) + + +class FlowPhotoMetricLoss(nn.Module): + def __init__( + self, + ssim_weight: float = 0.85, + ssim_window_size: int = 11, + ssim_max_val: float = 1.0, + ssim_sigma: float = 1.5, + ssim_eps: float = 1e-12, + ssim_use_padding: bool = True, + max_displacement_ratio: float = 0.15, + ) -> None: + super().__init__() + + self._ssim_loss = SSIM( + kernel_size=ssim_window_size, + max_val=ssim_max_val, + sigma=ssim_sigma, + eps=ssim_eps, + use_padding=ssim_use_padding, + ) + + self._L1_weight = 1 - ssim_weight + self._SSIM_weight = ssim_weight + self._max_displacement_ratio = max_displacement_ratio + + def forward( + self, + source: Tensor, + reference: Tensor, + flow_pred: Tensor, + valid_mask: Optional[Tensor] = None, + ): + """ + Args: + source: tensor of shape (B, C, H, W) + reference: tensor of shape (B, C, H, W) + flow_pred: tensor of shape (B, 2, H, W) + valid_mask: tensor of shape (B, H, W) or None + + Returns: + photometric loss of shape + + """ + torch._assert( + source.ndim == 4, + "FlowPhotoMetricLoss: source must have 4 dimensions, but got {}".format(source.ndim), + ) + torch._assert( + reference.ndim == source.ndim, + "FlowPhotoMetricLoss: source and other must have the same number of dimensions, but got {} and {}".format( + source.ndim, reference.ndim + ), + ) + torch._assert( + flow_pred.shape[1] == 2, + "FlowPhotoMetricLoss: flow_pred must have 2 channels, but got {}".format(flow_pred.shape[1]), + ) + torch._assert( + flow_pred.ndim == 4, + "FlowPhotoMetricLoss: flow_pred must have 4 dimensions, but got {}".format(flow_pred.ndim), + ) + + B, C, H, W = source.shape + flow_channels = flow_pred.shape[1] + + max_displacements = [] + for dim in range(flow_channels): + shape_index = -1 - dim + max_displacements.append(int(self._max_displacement_ratio * source.shape[shape_index])) + + # mask out all pixels that have larger flow than the max flow allowed + max_flow_mask = torch.logical_and( + *[flow_pred[:, dim, :, :] < max_displacements[dim] for dim in range(flow_channels)] + ) + + if valid_mask is not None: + valid_mask = torch.logical_and(valid_mask, max_flow_mask).unsqueeze(1) + else: + valid_mask = max_flow_mask.unsqueeze(1) + + grid = make_coords_grid(B, H, W, device=str(source.device)) + resampled_grids = grid - flow_pred + resampled_grids = resampled_grids.permute(0, 2, 3, 1) + resampled_source = grid_sample(reference, resampled_grids, mode="bilinear") + + # compute SSIM loss + ssim_loss = self._ssim_loss(resampled_source * valid_mask, source * valid_mask) + l1_loss = (resampled_source * valid_mask - source * valid_mask).abs().mean(axis=(-3, -2, -1)) + loss = self._L1_weight * l1_loss + self._SSIM_weight * ssim_loss + + return loss.mean() diff --git a/references/depth/stereo/utils/metrics.py b/references/depth/stereo/utils/metrics.py new file mode 100644 index 00000000000..05b149fb048 --- /dev/null +++ b/references/depth/stereo/utils/metrics.py @@ -0,0 +1,49 @@ +from typing import Dict, List, Optional, Tuple + +from torch import Tensor + +AVAILABLE_METRICS = ["mae", "rmse", "epe", "bad1", "bad2", "epe", "1px", "3px", "5px", "fl-all", "relepe"] + + +def compute_metrics( + 
flow_pred: Tensor, flow_gt: Tensor, valid_flow_mask: Optional[Tensor], metrics: List[str] +) -> Tuple[Dict[str, float], int]: + for m in metrics: + if m not in AVAILABLE_METRICS: + raise ValueError(f"Invalid metric: {m}. Valid metrics are: {AVAILABLE_METRICS}") + + metrics_dict = {} + + pixels_diffs = (flow_pred - flow_gt).abs() + # there is no Y flow in Stereo Matching, therefore flow.abs() = flow.pow(2).sum(dim=1).sqrt() + flow_norm = flow_gt.abs() + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + pixels_diffs = pixels_diffs[valid_flow_mask] + flow_norm = flow_norm[valid_flow_mask] + + num_pixels = pixels_diffs.numel() + if "bad1" in metrics: + metrics_dict["bad1"] = (pixels_diffs > 1).float().mean().item() + if "bad2" in metrics: + metrics_dict["bad2"] = (pixels_diffs > 2).float().mean().item() + + if "mae" in metrics: + metrics_dict["mae"] = pixels_diffs.mean().item() + if "rmse" in metrics: + metrics_dict["rmse"] = pixels_diffs.pow(2).mean().sqrt().item() + if "epe" in metrics: + metrics_dict["epe"] = pixels_diffs.mean().item() + if "1px" in metrics: + metrics_dict["1px"] = (pixels_diffs < 1).float().mean().item() + if "3px" in metrics: + metrics_dict["3px"] = (pixels_diffs < 3).float().mean().item() + if "5px" in metrics: + metrics_dict["5px"] = (pixels_diffs < 5).float().mean().item() + if "fl-all" in metrics: + metrics_dict["fl-all"] = ((pixels_diffs < 3) & ((pixels_diffs / flow_norm) < 0.05)).float().mean().item() * 100 + if "relepe" in metrics: + metrics_dict["relepe"] = (pixels_diffs / flow_norm).mean().item() + + return metrics_dict, num_pixels diff --git a/references/depth/stereo/utils/norm.py b/references/depth/stereo/utils/norm.py new file mode 100644 index 00000000000..7f6e0011160 --- /dev/null +++ b/references/depth/stereo/utils/norm.py @@ -0,0 +1,13 @@ +import torch + + +def freeze_batch_norm(model): + for m in model.modules(): + if isinstance(m, torch.nn.BatchNorm2d): + m.eval() + + +def unfreeze_batch_norm(model): + for m in model.modules(): + if isinstance(m, torch.nn.BatchNorm2d): + m.train() diff --git a/references/depth/stereo/utils/padder.py b/references/depth/stereo/utils/padder.py new file mode 100644 index 00000000000..7d2c63afba6 --- /dev/null +++ b/references/depth/stereo/utils/padder.py @@ -0,0 +1,28 @@ +import torch.nn.functional as F + + +class InputPadder: + """Pads images such that dimensions are divisible by 8""" + + # TODO: Ideally, this should be part of the eval transforms preset, instead + # of being part of the validation code. It's not obvious what a good + # solution would be, because we need to unpad the predicted flows according + # to the input images' size, and in some datasets (Kitti) images can have + # variable sizes. 
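To see how `compute_metrics` above and the `InputPadder` defined just below fit together, here is a hedged sketch of a per-pair evaluation step; `model` is a placeholder for any stereo network that returns a full-resolution disparity map, and the import relies on the re-exports in `utils/__init__.py` shown earlier:

```python
import torch

from utils import compute_metrics, InputPadder  # re-exported by utils/__init__.py


def evaluate_pair(model, left, right, disp_gt, valid_mask):
    # left/right: (1, 3, H, W) float, disp_gt: (1, 1, H, W), valid_mask: (1, H, W) bool
    padder = InputPadder(left.shape, mode="kitti")  # pad H and W up to multiples of 8
    left, right = padder.pad(left, right)
    with torch.inference_mode():
        disp_pred = model(left, right)  # assumed to return a (1, 1, H_pad, W_pad) disparity map
    disp_pred = padder.unpad(disp_pred)  # crop back to the original resolution
    metrics, num_px = compute_metrics(
        disp_pred, disp_gt, valid_mask, metrics=["mae", "epe", "bad1", "bad2", "1px"]
    )
    return metrics, num_px
```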
+ + def __init__(self, dims, mode="sintel"): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == "sintel": + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2] + else: + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode="replicate") for x in inputs] + + def unpad(self, x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0] : c[1], c[2] : c[3]] diff --git a/references/depth/stereo/visualization.py b/references/depth/stereo/visualization.py new file mode 100644 index 00000000000..07a7e7167d3 --- /dev/null +++ b/references/depth/stereo/visualization.py @@ -0,0 +1,127 @@ +import os +from typing import List + +import numpy as np +import numpy.typing as npt +import torch +from torch import Tensor +from torchvision.utils import make_grid + + +@torch.no_grad() +def make_disparity_image(disparity: Tensor): + # normalize image to [0, 1] + disparity = disparity.detach().cpu() + disparity = (disparity - disparity.min()) / (disparity.max() - disparity.min()) + return disparity + + +@torch.no_grad() +def make_disparity_image_pairs(disparity: Tensor, image: Tensor): + disparity = make_disparity_image(disparity) + # image is in [-1, 1], bring it to [0, 1] + image = image.detach().cpu() + image = image * 0.5 + 0.5 + return disparity, image + + +@torch.no_grad() +def make_disparity_sequence(disparities: List[Tensor]): + # convert each disparity to [0, 1] + for idx, disparity_batch in enumerate(disparities): + disparities[idx] = torch.stack(list(map(make_disparity_image, disparity_batch))) + # make the list into a batch + disparity_sequences = torch.stack(disparities) + return disparity_sequences + + +@torch.no_grad() +def make_pair_grid(*inputs, orientation="horizontal"): + # make a grid of images with the outputs and references side by side + if orientation == "horizontal": + # interleave the outputs and references + canvas = torch.zeros_like(inputs[0]) + canvas = torch.cat([canvas] * len(inputs), dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] = inp + grid = make_grid(canvas, nrow=len(inputs), padding=16, normalize=True, scale_each=True) + elif orientation == "vertical": + # interleave the outputs and references + canvas = torch.cat(inputs, dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] 
= inp + grid = make_grid(canvas, nrow=len(inputs[0]), padding=16, normalize=True, scale_each=True) + else: + raise ValueError("Unknown orientation: {}".format(orientation)) + return grid + + +@torch.no_grad() +def make_training_sample_grid( + left_images: Tensor, + right_images: Tensor, + disparities: Tensor, + masks: Tensor, + predictions: List[Tensor], +) -> npt.NDArray: + # detach images and renormalize to [0, 1] + images_left = left_images.detach().cpu() * 0.5 + 0.5 + images_right = right_images.detach().cpu() * 0.5 + 0.5 + # detach the disparties and predictions + disparities = disparities.detach().cpu() + predictions = predictions[-1].detach().cpu() + # keep only the first channel of pixels, and repeat it 3 times + disparities = disparities[:, :1, ...].repeat(1, 3, 1, 1) + predictions = predictions[:, :1, ...].repeat(1, 3, 1, 1) + # unsqueeze and repeat the masks + masks = masks.detach().cpu().unsqueeze(1).repeat(1, 3, 1, 1) + # make a grid that will self normalize across the batch + pred_grid = make_pair_grid(images_left, images_right, masks, disparities, predictions, orientation="horizontal") + pred_grid = pred_grid.permute(1, 2, 0).numpy() + pred_grid = (pred_grid * 255).astype(np.uint8) + return pred_grid + + +@torch.no_grad() +def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> npt.NDArray: + # right most we will be adding the ground truth + seq_len = len(predictions) + 1 + predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities])) + sequence = make_disparity_sequence(predictions) + # swap axes to have the in the correct order for each batch sample + sequence = torch.swapaxes(sequence, 0, 1).contiguous().reshape(-1, 1, disparities.shape[-2], disparities.shape[-1]) + sequence = make_grid(sequence, nrow=seq_len, padding=16, normalize=True, scale_each=True) + sequence = sequence.permute(1, 2, 0).numpy() + sequence = (sequence * 255).astype(np.uint8) + return sequence + + +@torch.no_grad() +def make_prediction_image_side_to_side( + predictions: Tensor, disparities: Tensor, valid_mask: Tensor, save_path: str, prefix: str +) -> None: + import matplotlib.pyplot as plt + + # normalize the predictions and disparities in [0, 1] + predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min()) + disparities = (disparities - disparities.min()) / (disparities.max() - disparities.min()) + predictions = predictions * valid_mask + disparities = disparities * valid_mask + + predictions = predictions.detach().cpu() + disparities = disparities.detach().cpu() + + for idx, (pred, gt) in enumerate(zip(predictions, disparities)): + pred = pred.permute(1, 2, 0).numpy() + gt = gt.permute(1, 2, 0).numpy() + # plot pred and gt side by side + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + ax[0].imshow(pred) + ax[0].set_title("Prediction") + ax[1].imshow(gt) + ax[1].set_title("Ground Truth") + save_name = os.path.join(save_path, "{}_{}.png".format(prefix, idx)) + plt.savefig(save_name) + plt.close() diff --git a/references/detection/README.md b/references/detection/README.md index aec7c10e1b5..d9af26523a5 100644 --- a/references/detection/README.md +++ b/references/detection/README.md @@ -68,7 +68,7 @@ torchrun --nproc_per_node=8 train.py\ torchrun --nproc_per_node=8 train.py\ --dataset coco --model ssdlite320_mobilenet_v3_large --epochs 660\ --aspect-ratio-group-factor 3 --lr-scheduler cosineannealinglr --lr 0.15 --batch-size 24\ - --weight-decay 0.00004 --data-augmentation ssdlite --weights-backbone 
MobileNet_V3_Large_Weights.IMAGENET1K_V1 + --weight-decay 0.00004 --data-augmentation ssdlite ``` diff --git a/references/detection/coco_utils.py b/references/detection/coco_utils.py index 396de63297b..44b917a6ec6 100644 --- a/references/detection/coco_utils.py +++ b/references/detection/coco_utils.py @@ -1,4 +1,3 @@ -import copy import os import torch @@ -9,24 +8,6 @@ from pycocotools.coco import COCO -class FilterAndRemapCocoCategories: - def __init__(self, categories, remap=True): - self.categories = categories - self.remap = remap - - def __call__(self, image, target): - anno = target["annotations"] - anno = [obj for obj in anno if obj["category_id"] in self.categories] - if not self.remap: - target["annotations"] = anno - return image, target - anno = copy.deepcopy(anno) - for obj in anno: - obj["category_id"] = self.categories.index(obj["category_id"]) - target["annotations"] = anno - return image, target - - def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: @@ -49,7 +30,6 @@ def __call__(self, image, target): w, h = image.size image_id = target["image_id"] - image_id = torch.tensor([image_id]) anno = target["annotations"] @@ -116,7 +96,7 @@ def _has_valid_annotation(anno): # if all boxes have close to zero area, there is no annotation if _has_only_empty_bbox(anno): return False - # keypoints task have a slight different critera for considering + # keypoints task have a slight different criteria for considering # if an annotation is valid if "keypoints" not in anno[0]: return True @@ -126,10 +106,6 @@ def _has_valid_annotation(anno): return True return False - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -147,13 +123,13 @@ def convert_to_coco_api(ds): coco_ds = COCO() # annotation IDs need to start at 1, not 0, see torchvision issue #1530 ann_id = 1 - dataset = {"images": [], "categories": [], "annotations": []} + dataset = {"images": [], "categories": [], "annotations": [], "info": {}} categories = set() for img_idx in range(len(ds)): # find better way to get target # targets = ds.get_annotations(img_idx) img, targets = ds[img_idx] - image_id = targets["image_id"].item() + image_id = targets["image_id"] img_dict = {} img_dict["id"] = image_id img_dict["height"] = img.shape[-2] @@ -196,6 +172,7 @@ def convert_to_coco_api(ds): def get_coco_api_from_dataset(dataset): + # FIXME: This is... awful? 
for _ in range(10): if isinstance(dataset, torchvision.datasets.CocoDetection): break @@ -220,7 +197,7 @@ def __getitem__(self, idx): return img, target -def get_coco(root, image_set, transforms, mode="instances"): +def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): anno_file_template = "{}_{}2017.json" PATHS = { "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), @@ -228,17 +205,26 @@ def get_coco(root, image_set, transforms, mode="instances"): # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) } - t = [ConvertCocoPolysToMask()] - - if transforms is not None: - t.append(transforms) - transforms = T.Compose(t) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + if use_v2: + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + target_keys = ["boxes", "labels", "image_id"] + if with_masks: + target_keys += ["masks"] + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + else: + # TODO: handle with_masks for V1? + t = [ConvertCocoPolysToMask()] + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset) @@ -246,7 +232,3 @@ def get_coco(root, image_set, transforms, mode="instances"): # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) return dataset - - -def get_coco_kp(root, image_set, transforms): - return get_coco(root, image_set, transforms, mode="person_keypoints") diff --git a/references/detection/engine.py b/references/detection/engine.py index 0e5d55f189d..0e9bfffdf8a 100644 --- a/references/detection/engine.py +++ b/references/detection/engine.py @@ -26,7 +26,7 @@ def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, sc for images, targets in metric_logger.log_every(data_loader, print_freq, header): images = list(image.to(device) for image in images) - targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets] with torch.cuda.amp.autocast(enabled=scaler is not None): loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) @@ -97,7 +97,7 @@ def evaluate(model, data_loader, device): outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] model_time = time.time() - model_time - res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} + res = {target["image_id"]: output for target, output in zip(targets, outputs)} evaluator_time = time.time() coco_evaluator.update(res) evaluator_time = time.time() - evaluator_time diff --git a/references/detection/group_by_aspect_ratio.py b/references/detection/group_by_aspect_ratio.py index 5312cc036d6..d12e14b540c 100644 --- a/references/detection/group_by_aspect_ratio.py +++ b/references/detection/group_by_aspect_ratio.py @@ -63,7 +63,7 @@ def __iter__(self): expected_num_batches = len(self) num_remaining = expected_num_batches - num_batches if num_remaining > 0: - # for the remaining batches, take first the buffers with largest number + # for the 
remaining batches, take first the buffers with the largest number # of elements for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True): remaining = self.batch_size - len(buffer_per_group[group_id]) diff --git a/references/detection/presets.py b/references/detection/presets.py index 779f3f218ca..e9b6d56c886 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -1,73 +1,114 @@ +from collections import defaultdict + import torch -import transforms as T +import transforms as reference_transforms + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + + return torchvision.transforms.v2, torchvision.tv_tensors + else: + return reference_transforms, None class DetectionPresetTrain: - def __init__(self, *, data_augmentation, hflip_prob=0.5, mean=(123.0, 117.0, 104.0)): + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. + def __init__( + self, + *, + data_augmentation, + hflip_prob=0.5, + mean=(123.0, 117.0, 104.0), + backend="pil", + use_v2=False, + ): + + T, tv_tensors = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + if data_augmentation == "hflip": - self.transforms = T.Compose( - [ - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [T.RandomHorizontalFlip(p=hflip_prob)] elif data_augmentation == "lsj": - self.transforms = T.Compose( - [ - T.ScaleJitter(target_size=(1024, 1024)), - T.FixedSizeCrop(size=(1024, 1024), fill=mean), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.ScaleJitter(target_size=(1024, 1024), antialias=True), + # TODO: FixedSizeCrop below doesn't work on tensors! 
+ reference_transforms.FixedSizeCrop(size=(1024, 1024), fill=mean), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "multiscale": - self.transforms = T.Compose( - [ - T.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomShortestSize(min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssd": - self.transforms = T.Compose( - [ - T.RandomPhotometricDistort(), - T.RandomZoomOut(fill=list(mean)), - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + fill = defaultdict(lambda: mean, {tv_tensors.Mask: 0}) if use_v2 else list(mean) + transforms += [ + T.RandomPhotometricDistort(), + T.RandomZoomOut(fill=fill), + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssdlite": - self.transforms = T.Compose( - [ - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] else: raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') + if backend == "pil": + # Note: we could just convert to pure tensors even in v2. + transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [ + T.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.XYXY), + T.SanitizeBoundingBoxes(), + T.ToPureTensor(), + ] + + self.transforms = T.Compose(transforms) + def __call__(self, img, target): return self.transforms(img, target) class DetectionPresetEval: - def __init__(self): - self.transforms = T.Compose( - [ - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + def __init__(self, backend="pil", use_v2=False): + T, _ = get_modules(use_v2) + transforms = [] + backend = backend.lower() + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? 
+ transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + elif backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + else: + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/detection/train.py b/references/detection/train.py index dea483c5f75..6a9ffb0af4d 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -28,7 +28,7 @@ import torchvision.models.detection import torchvision.models.detection.mask_rcnn import utils -from coco_utils import get_coco, get_coco_kp +from coco_utils import get_coco from engine import evaluate, train_one_epoch from group_by_aspect_ratio import create_aspect_ratio_groups, GroupedBatchSampler from torchvision.transforms import InterpolationMode @@ -40,23 +40,32 @@ def copypaste_collate_fn(batch): return copypaste(*utils.collate_fn(batch)) -def get_dataset(name, image_set, transform, data_path): - paths = {"coco": (data_path, get_coco, 91), "coco_kp": (data_path, get_coco_kp, 2)} - p, ds_fn, num_classes = paths[name] - - ds = ds_fn(p, image_set=image_set, transforms=transform) +def get_dataset(is_train, args): + image_set = "train" if is_train else "val" + num_classes, mode = {"coco": (91, "instances"), "coco_kp": (2, "person_keypoints")}[args.dataset] + with_masks = "mask" in args.model + ds = get_coco( + root=args.data_path, + image_set=image_set, + transforms=get_transform(is_train, args), + mode=mode, + use_v2=args.use_v2, + with_masks=with_masks, + ) return ds, num_classes -def get_transform(train, args): - if train: - return presets.DetectionPresetTrain(data_augmentation=args.data_augmentation) +def get_transform(is_train, args): + if is_train: + return presets.DetectionPresetTrain( + data_augmentation=args.data_augmentation, backend=args.backend, use_v2=args.use_v2 + ) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() return lambda img, target: (trans(img), target) else: - return presets.DetectionPresetEval() + return presets.DetectionPresetEval(backend=args.backend, use_v2=args.use_v2) def get_args_parser(add_help=True): @@ -65,7 +74,12 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Detection Training", add_help=add_help) parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") - parser.add_argument("--dataset", default="coco", type=str, help="dataset name") + parser.add_argument( + "--dataset", + default="coco", + type=str, + help="dataset name. Use coco for object detection and instance segmentation and coco_kp for Keypoint detection", + ) parser.add_argument("--model", default="maskrcnn_resnet50_fpn", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -159,10 +173,22 @@ def get_args_parser(add_help=True): help="Use CopyPaste data augmentation. 
Works only with data-augmentation='lsj'.", ) + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") + return parser def main(args): + if args.backend.lower() == "tv_tensor" and not args.use_v2: + raise ValueError("Use --use-v2 if you want to use the tv_tensor backend.") + if args.dataset not in ("coco", "coco_kp"): + raise ValueError(f"Dataset should be coco or coco_kp, got {args.dataset}") + if "keypoint" in args.model and args.dataset != "coco_kp": + raise ValueError("Oops, if you want Keypoint detection, set --dataset coco_kp") + if args.dataset == "coco_kp" and args.use_v2: + raise ValueError("KeyPoint detection doesn't support V2 transforms yet") + if args.output_dir: utils.mkdir(args.output_dir) @@ -177,8 +203,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path) - dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path) + dataset, num_classes = get_dataset(is_train=True, args=args) + dataset_test, _ = get_dataset(is_train=False, args=args) print("Creating data loaders") if args.distributed: @@ -262,7 +288,7 @@ def main(args): ) if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) diff --git a/references/detection/transforms.py b/references/detection/transforms.py index d26bf6eac85..e07ccfc9921 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -53,14 +53,17 @@ def forward( return image, target -class ConvertImageDtype(nn.Module): - def __init__(self, dtype: torch.dtype) -> None: +class ToDtype(nn.Module): + def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: super().__init__() self.dtype = dtype + self.scale = scale def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target @@ -293,11 +296,13 @@ def __init__( target_size: Tuple[int, int], scale_range: Tuple[float, float] = (0.1, 2.0), interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias=True, ): super().__init__() self.target_size = target_size self.scale_range = scale_range self.interpolation = interpolation + self.antialias = antialias def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None @@ -315,14 +320,17 @@ def forward( new_width = int(orig_width * r) new_height = int(orig_height * r) - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) if target is not None: target["boxes"][:, 0::2] *= new_width / orig_width target["boxes"][:, 1::2] *= new_height / orig_height if "masks" in target: target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + target["masks"], + [new_height, new_width], + interpolation=InterpolationMode.NEAREST, + antialias=self.antialias, ) return image, target diff --git 
a/references/optical_flow/README.md b/references/optical_flow/README.md index a7ac0223739..6ad1d4079f7 100644 --- a/references/optical_flow/README.md +++ b/references/optical_flow/README.md @@ -56,7 +56,7 @@ torchrun --nproc_per_node 1 --nnodes 1 train.py --val-dataset sintel --batch-siz This should give an epe of about 1.3822 on the clean pass and 2.7161 on the final pass of Sintel-train. Results may vary slightly depending on the batch -size and the number of GPUs. For the most accurate resuts use 1 GPU and +size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`: ``` diff --git a/references/optical_flow/train.py b/references/optical_flow/train.py index be6ffe4ccef..7012ea6f810 100644 --- a/references/optical_flow/train.py +++ b/references/optical_flow/train.py @@ -82,7 +82,7 @@ def _evaluate(model, args, val_dataset, *, padder_mode, num_flow_updates=None, b def inner_loop(blob): if blob[0].dim() == 3: - # input is not batched so we add an extra dim for consistency + # input is not batched, so we add an extra dim for consistency blob = [x[None, :, :, :] if x is not None else None for x in blob] image1, image2, flow_gt = blob[:3] @@ -150,7 +150,7 @@ def preprocessing(img1, img2, flow, valid_flow_mask): for name in val_datasets: if name == "kitti": - # Kitti has different image sizes so we need to individually pad them, we can't batch. + # Kitti has different image sizes, so we need to individually pad them, we can't batch. # see comment in InputPadder if args.batch_size != 1 and (not args.distributed or args.rank == 0): warnings.warn( @@ -226,7 +226,7 @@ def main(args): model_without_ddp = model if args.resume is not None: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) if args.test_only: diff --git a/references/optical_flow/transforms.py b/references/optical_flow/transforms.py index 6011608183a..bc831a2ee52 100644 --- a/references/optical_flow/transforms.py +++ b/references/optical_flow/transforms.py @@ -164,7 +164,7 @@ class RandomResizeAndCrop(torch.nn.Module): # The reason we don't rely on RandomResizedCrop is because of a significant # difference in the parametrization of both transforms, in particular, # because of the way the random parameters are sampled in both transforms, - # which leads to fairly different resuts (and different epe). For more details see + # which leads to fairly different results (and different epe). For more details see # https://github.com/pytorch/vision/pull/5026/files#r762932579 def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, stretch_prob=0.8): super().__init__() @@ -196,8 +196,12 @@ def forward(self, img1, img2, flow, valid_flow_mask): if torch.rand(1).item() < self.resize_prob: # rescale the images - img1 = F.resize(img1, size=(new_h, new_w)) - img2 = F.resize(img2, size=(new_h, new_w)) + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the OF models with antialias=True? 
+ img1 = F.resize(img1, size=(new_h, new_w), antialias=False) + img2 = F.resize(img2, size=(new_h, new_w), antialias=False) if valid_flow_mask is None: flow = F.resize(flow, size=(new_h, new_w)) flow = flow * torch.tensor([scale_x, scale_y])[:, None, None] @@ -208,7 +212,7 @@ def forward(self, img1, img2, flow, valid_flow_mask): # Note: For sparse datasets (Kitti), the original code uses a "margin" # See e.g. https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 - # We don't, not sure it matters much + # We don't, not sure if it matters much y0 = torch.randint(0, img1.shape[1] - self.crop_size[0], size=(1,)).item() x0 = torch.randint(0, img1.shape[2] - self.crop_size[1], size=(1,)).item() diff --git a/references/optical_flow/utils.py b/references/optical_flow/utils.py index 8b07e9de35c..cd4b16eb0d8 100644 --- a/references/optical_flow/utils.py +++ b/references/optical_flow/utils.py @@ -181,7 +181,7 @@ def sequence_loss(flow_preds, flow_gt, valid_flow_mask, gamma=0.8, max_flow=400) if gamma > 1: raise ValueError(f"Gamma should be < 1, got {gamma}.") - # exlude invalid pixels and extremely large diplacements + # exclude invalid pixels and extremely large displacements flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) @@ -248,7 +248,7 @@ def setup_ddp(args): # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 if all(key in os.environ for key in ("LOCAL_RANK", "RANK", "WORLD_SIZE")): - # if we're here, the script was called with torchrun. Otherwise + # if we're here, the script was called with torchrun. Otherwise, # these args will be set already by the run_with_submitit script args.local_rank = int(os.environ["LOCAL_RANK"]) args.rank = int(os.environ["RANK"]) diff --git a/references/segmentation/README.md b/references/segmentation/README.md index 2c7391c8380..2c8e581dac1 100644 --- a/references/segmentation/README.md +++ b/references/segmentation/README.md @@ -1,7 +1,7 @@ # Semantic segmentation reference training scripts This folder contains reference training scripts for semantic segmentation. -They serve as a log of how to train specific models, as provide baseline +They serve as a log of how to train specific models and provide baseline training and evaluation scripts to quickly bootstrap research. All models have been trained on 8x V100 GPUs.
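Both the optical-flow transform above and the video presets further down hard-code `antialias=False` in their `F.resize` calls to preserve the behavior the released checkpoints were trained with, after the default for tensor inputs changed (https://github.com/pytorch/vision/pull/7160). As a rough, illustrative sketch of what the flag changes for a float CHW tensor with the default bilinear interpolation (sizes and values below are made up, not taken from the reference scripts):

```python
import torch
from torchvision.transforms import functional as F

img = torch.rand(3, 240, 320)  # made-up float CHW tensor

# Legacy tensor behaviour: no low-pass filtering before subsampling. This is what the
# optical-flow and video references pin so the pretrained weights keep matching their
# published metrics.
legacy = F.resize(img, [120, 160], antialias=False)

# Current default for tensors: antialiased downscaling, much closer to the PIL backend
# (and what the segmentation reference now requests explicitly with antialias=True).
smooth = F.resize(img, [120, 160], antialias=True)

# The two outputs differ slightly, which is why switching would require re-training
# (see the TODO comments in the hunks above).
print((legacy - smooth).abs().max().item())
```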
diff --git a/references/segmentation/coco_utils.py b/references/segmentation/coco_utils.py index e02434012f1..6a15dbefb52 100644 --- a/references/segmentation/coco_utils.py +++ b/references/segmentation/coco_utils.py @@ -68,11 +68,6 @@ def _has_valid_annotation(anno): # if more than 1k pixels occupied in the image return sum(obj["area"] for obj in anno) > 1000 - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) - ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -86,7 +81,7 @@ def _has_valid_annotation(anno): return dataset -def get_coco(root, image_set, transforms): +def get_coco(root, image_set, transforms, use_v2=False): PATHS = { "train": ("train2017", os.path.join("annotations", "instances_train2017.json")), "val": ("val2017", os.path.join("annotations", "instances_val2017.json")), @@ -94,13 +89,24 @@ def get_coco(root, image_set, transforms): } CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72] - transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + # The 2 "Compose" below achieve the same thing: converting coco detection + # samples into segmentation-compatible samples. They just do it with + # slightly different implementations. We could refactor and unify, but + # keeping them separate helps keeping the v2 version clean + if use_v2: + import v2_extras + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + transforms = Compose([v2_extras.CocoDetectionToVOCSegmentation(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys={"masks", "labels"}) + else: + transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset, CAT_LIST) diff --git a/references/segmentation/presets.py b/references/segmentation/presets.py index ed02ae660e4..803769fcafc 100644 --- a/references/segmentation/presets.py +++ b/references/segmentation/presets.py @@ -1,39 +1,109 @@ import torch -import transforms as T + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + import v2_extras + + return torchvision.transforms.v2, torchvision.tv_tensors, v2_extras + else: + import transforms + + return transforms, None, None class SegmentationPresetTrain: - def __init__(self, *, base_size, crop_size, hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - min_size = int(0.5 * base_size) - max_size = int(2.0 * base_size) + def __init__( + self, + *, + base_size, + crop_size, + hflip_prob=0.5, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + backend="pil", + use_v2=False, + ): + T, tv_tensors, v2_extras = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == 
"tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))] - trans = [T.RandomResize(min_size, max_size)] if hflip_prob > 0: - trans.append(T.RandomHorizontalFlip(hflip_prob)) - trans.extend( - [ - T.RandomCrop(crop_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), + transforms += [T.RandomHorizontalFlip(hflip_prob)] + + if use_v2: + # We need a custom pad transform here, since the padding we want to perform here is fundamentally + # different from the padding in `RandomCrop` if `pad_if_needed=True`. + transforms += [v2_extras.PadIfSmaller(crop_size, fill={tv_tensors.Mask: 255, "others": 0})] + + transforms += [T.RandomCrop(crop_size)] + + if backend == "pil": + transforms += [T.PILToTensor()] + + if use_v2: + img_type = tv_tensors.Image if backend == "tv_tensor" else torch.Tensor + transforms += [ + T.ToDtype(dtype={img_type: torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True) ] - ) - self.transforms = T.Compose(trans) + else: + # No need to explicitly convert masks as they're magically int64 already + transforms += [T.ToDtype(torch.float, scale=True)] + + transforms += [T.Normalize(mean=mean, std=std)] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) class SegmentationPresetEval: - def __init__(self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - self.transforms = T.Compose( - [ - T.RandomResize(base_size, base_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), - ] - ) + def __init__( + self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), backend="pil", use_v2=False + ): + T, _, _ = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + if use_v2: + transforms += [T.Resize(size=(base_size, base_size))] + else: + transforms += [T.RandomResize(min_size=base_size, max_size=base_size)] + + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? 
+ transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [ + T.ToDtype(torch.float, scale=True), + T.Normalize(mean=mean, std=std), + ] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/segmentation/train.py b/references/segmentation/train.py index bb57e65b801..abdc3c6aacb 100644 --- a/references/segmentation/train.py +++ b/references/segmentation/train.py @@ -14,24 +14,30 @@ from torchvision.transforms import functional as F, InterpolationMode -def get_dataset(dir_path, name, image_set, transform): +def get_dataset(args, is_train): def sbd(*args, **kwargs): + kwargs.pop("use_v2") return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs) + def voc(*args, **kwargs): + kwargs.pop("use_v2") + return torchvision.datasets.VOCSegmentation(*args, **kwargs) + paths = { - "voc": (dir_path, torchvision.datasets.VOCSegmentation, 21), - "voc_aug": (dir_path, sbd, 21), - "coco": (dir_path, get_coco, 21), + "voc": (args.data_path, voc, 21), + "voc_aug": (args.data_path, sbd, 21), + "coco": (args.data_path, get_coco, 21), } - p, ds_fn, num_classes = paths[name] + p, ds_fn, num_classes = paths[args.dataset] - ds = ds_fn(p, image_set=image_set, transforms=transform) + image_set = "train" if is_train else "val" + ds = ds_fn(p, image_set=image_set, transforms=get_transform(is_train, args), use_v2=args.use_v2) return ds, num_classes -def get_transform(train, args): - if train: - return presets.SegmentationPresetTrain(base_size=520, crop_size=480) +def get_transform(is_train, args): + if is_train: + return presets.SegmentationPresetTrain(base_size=520, crop_size=480, backend=args.backend, use_v2=args.use_v2) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() @@ -44,7 +50,7 @@ def preprocessing(img, target): return preprocessing else: - return presets.SegmentationPresetEval(base_size=520) + return presets.SegmentationPresetEval(base_size=520, backend=args.backend, use_v2=args.use_v2) def criterion(inputs, target): @@ -120,6 +126,12 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi def main(args): + if args.backend.lower() != "pil" and not args.use_v2: + # TODO: Support tensor backend in V1? 
+ raise ValueError("Use --use-v2 if you want to use the tv_tensor or tensor backend.") + if args.use_v2 and args.dataset != "coco": + raise ValueError("v2 is only support supported for coco dataset for now.") + if args.output_dir: utils.mkdir(args.output_dir) @@ -134,8 +146,8 @@ def main(args): else: torch.backends.cudnn.benchmark = True - dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args)) - dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args)) + dataset, num_classes = get_dataset(args, is_train=True) + dataset_test, _ = get_dataset(args, is_train=False) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) @@ -211,7 +223,7 @@ def main(args): lr_scheduler = main_lr_scheduler if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"], strict=not args.test_only) if not args.test_only: optimizer.load_state_dict(checkpoint["optimizer"]) @@ -260,7 +272,7 @@ def get_args_parser(add_help=True): parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") parser.add_argument("--dataset", default="coco", type=str, help="dataset name") parser.add_argument("--model", default="fcn_resnet101", type=str, help="model name") - parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss") + parser.add_argument("--aux-loss", action="store_true", help="auxiliary loss") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( "-b", "--batch-size", default=8, type=int, help="images per gpu, the total batch size is $NGPU x batch_size" @@ -307,6 +319,8 @@ def get_args_parser(add_help=True): # Mixed precision training parameters parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/segmentation/transforms.py b/references/segmentation/transforms.py index 518048db2fa..6934b9f862e 100644 --- a/references/segmentation/transforms.py +++ b/references/segmentation/transforms.py @@ -35,7 +35,7 @@ def __init__(self, min_size, max_size=None): def __call__(self, image, target): size = random.randint(self.min_size, self.max_size) - image = F.resize(image, size) + image = F.resize(image, size, antialias=True) target = F.resize(target, size, interpolation=T.InterpolationMode.NEAREST) return image, target @@ -81,11 +81,14 @@ def __call__(self, image, target): return image, target -class ConvertImageDtype: - def __init__(self, dtype): +class ToDtype: + def __init__(self, dtype, scale=False): self.dtype = dtype + self.scale = scale def __call__(self, image, target): + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target diff --git a/references/segmentation/utils.py b/references/segmentation/utils.py index 4ea24db83ed..92db1899851 100644 --- a/references/segmentation/utils.py +++ b/references/segmentation/utils.py @@ -88,7 +88,7 @@ def compute(self): return acc_global, acc, iu def reduce_from_all_processes(self): - reduce_across_processes(self.mat) + self.mat = 
reduce_across_processes(self.mat).to(torch.int64) def __str__(self): acc_global, acc, iu = self.compute() @@ -267,9 +267,9 @@ def init_distributed_mode(args): args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ["WORLD_SIZE"]) args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() + # elif "SLURM_PROCID" in os.environ: + # args.rank = int(os.environ["SLURM_PROCID"]) + # args.gpu = args.rank % torch.cuda.device_count() elif hasattr(args, "rank"): pass else: diff --git a/references/segmentation/v2_extras.py b/references/segmentation/v2_extras.py new file mode 100644 index 00000000000..2d9eb3e661a --- /dev/null +++ b/references/segmentation/v2_extras.py @@ -0,0 +1,83 @@ +"""This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1.""" +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +class PadIfSmaller(v2.Transform): + def __init__(self, size, fill=0): + super().__init__() + self.size = size + self.fill = v2._utils._setup_fill_arg(fill) + + def make_params(self, sample): + _, height, width = v2._utils.query_chw(sample) + padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] + needs_padding = any(padding) + return dict(padding=padding, needs_padding=needs_padding) + + def transform(self, inpt, params): + if not params["needs_padding"]: + return inpt + + fill = v2._utils._get_fill(self.fill, type(inpt)) + fill = v2._utils._convert_fill_arg(fill) + + return v2.functional.pad(inpt, padding=params["padding"], fill=fill) + + +class CocoDetectionToVOCSegmentation(v2.Transform): + """Turn samples from datasets.CocoDetection into the same format as VOCSegmentation. + + This is achieved in two steps: + + 1. COCO differentiates between 91 categories while VOC only supports 21, including background for both. Fortunately, + the COCO categories are a superset of the VOC ones and thus can be mapped. Instances of the 70 categories not + present in VOC are dropped and replaced by background. + 2. COCO only offers detection masks, i.e. a (N, H, W) bool-ish tensor, where the truthy values in each individual + mask denote the instance. However, a segmentation mask is a (H, W) integer tensor (typically torch.uint8), where + the value of each pixel denotes the category it belongs to. The detection masks are merged into one segmentation + mask while pixels that belong to multiple detection masks are marked as invalid. 
+ """ + + COCO_TO_VOC_LABEL_MAP = dict( + zip( + [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72], + range(21), + ) + ) + INVALID_VALUE = 255 + + def _coco_detection_masks_to_voc_segmentation_mask(self, target): + if "masks" not in target: + return None + + instance_masks, instance_labels_coco = target["masks"], target["labels"] + + valid_labels_voc = [ + (idx, label_voc) + for idx, label_coco in enumerate(instance_labels_coco.tolist()) + if (label_voc := self.COCO_TO_VOC_LABEL_MAP.get(label_coco)) is not None + ] + + if not valid_labels_voc: + return None + + valid_voc_category_idcs, instance_labels_voc = zip(*valid_labels_voc) + + instance_masks = instance_masks[list(valid_voc_category_idcs)].to(torch.uint8) + instance_labels_voc = torch.tensor(instance_labels_voc, dtype=torch.uint8) + + # Calling `.max()` on the stacked detection masks works fine to separate background from foreground as long as + # there is at most a single instance per pixel. Overlapping instances will be filtered out in the next step. + segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0) + segmentation_mask[instance_masks.sum(dim=0) > 1] = self.INVALID_VALUE + + return segmentation_mask + + def forward(self, image, target): + segmentation_mask = self._coco_detection_masks_to_voc_segmentation_mask(target) + if segmentation_mask is None: + segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8) + + return image, tv_tensors.Mask(segmentation_mask) diff --git a/references/similarity/sampler.py b/references/similarity/sampler.py index f4564eca33e..fe6517418ab 100644 --- a/references/similarity/sampler.py +++ b/references/similarity/sampler.py @@ -48,7 +48,7 @@ def __init__(self, groups, p, k): # Ensures there are enough classes to sample from if len(self.groups) < p: - raise ValueError("There are not enought classes to sample from") + raise ValueError("There are not enough classes to sample from") def __iter__(self): # Shuffle samples within groups diff --git a/references/similarity/train.py b/references/similarity/train.py index 146e2bef688..7686729927e 100644 --- a/references/similarity/train.py +++ b/references/similarity/train.py @@ -101,7 +101,7 @@ def main(args): model = EmbeddingNet() if args.resume: - model.load_state_dict(torch.load(args.resume)) + model.load_state_dict(torch.load(args.resume, weights_only=True)) model.to(device) diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 9bd1b9cc285..39c5d8f1bba 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -76,11 +76,12 @@ Input data augmentations at validation time (with optional parameters): 5. Convert BCHW to CBHW This translates in the following set of command-line arguments. Please note that `--batch-size` parameter controls the -batch size per GPU. Moreover note that our default `--lr` is configured for 64 GPUs which is how many we used for the +batch size per GPU. 
Moreover, note that our default `--lr` is configured for 64 GPUs which is how many we used for the Video resnet models: ``` # number of frames per clip --clip_len 16 \ +--frame-rate 15 \ # allow for temporal jittering --clips_per_video 5 \ --batch-size 24 \ @@ -97,6 +98,21 @@ Video resnet models: --val-crop-size 112 112 ``` +### S3D + +The S3D model was trained similarly to the above but with the following changes on the default configuration: +``` +--batch-size=12 --lr 0.2 --clip-len 64 --clips-per-video 5 --sync-bn \ +--train-resize-size 256 256 --train-crop-size 224 224 --val-resize-size 256 256 --val-crop-size 224 224 +``` + +We used 64 GPUs to train the architecture. + +To estimate the validation statistics of the model, we run the reference script with the following configuration: +``` +--batch-size=16 --test-only --clip-len 128 --clips-per-video 1 +``` + ### Additional video modelling resources - [Video Model Zoo](https://github.com/facebookresearch/VMZ) diff --git a/references/video_classification/presets.py b/references/video_classification/presets.py index ef774052257..f73802c9666 100644 --- a/references/video_classification/presets.py +++ b/references/video_classification/presets.py @@ -15,7 +15,11 @@ def __init__( ): trans = [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? + transforms.Resize(resize_size, antialias=False), ] if hflip_prob > 0: trans.append(transforms.RandomHorizontalFlip(hflip_prob)) @@ -31,7 +35,11 @@ def __init__(self, *, crop_size, resize_size, mean=(0.43216, 0.394666, 0.37645), self.transforms = transforms.Compose( [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? 
+ transforms.Resize(resize_size, antialias=False), transforms.Normalize(mean=mean, std=std), transforms.CenterCrop(crop_size), ConvertBCHWtoCBHW(), diff --git a/references/video_classification/train.py b/references/video_classification/train.py index e26231bb914..a03a9722003 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -164,7 +164,7 @@ def main(args): if args.cache_dataset and os.path.exists(cache_path): print(f"Loading dataset_train from {cache_path}") - dataset, _ = torch.load(cache_path) + dataset, _ = torch.load(cache_path, weights_only=False) dataset.transform = transform_train else: if args.distributed: @@ -201,7 +201,7 @@ def main(args): if args.cache_dataset and os.path.exists(cache_path): print(f"Loading dataset_test from {cache_path}") - dataset_test, _ = torch.load(cache_path) + dataset_test, _ = torch.load(cache_path, weights_only=False) dataset_test.transform = transform_test else: if args.distributed: @@ -295,7 +295,7 @@ def main(args): model_without_ddp = model.module if args.resume: - checkpoint = torch.load(args.resume, map_location="cpu") + checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True) model_without_ddp.load_state_dict(checkpoint["model"]) optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) diff --git a/release/README.md b/release/README.md new file mode 100644 index 00000000000..830f964e531 --- /dev/null +++ b/release/README.md @@ -0,0 +1,3 @@ +# Vision Release Scripts + +This folder contains script(s) used for releasing new versions of the Vision package diff --git a/release/apply-release-changes.py b/release/apply-release-changes.py new file mode 100644 index 00000000000..22dd37216f8 --- /dev/null +++ b/release/apply-release-changes.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +""" +apply-release-changes.py - Cross-platform script to replace main with a specified release version in YML files + +This script performs two replacements in YML files in .github/workflows/: +1. Replaces @main with @release/VERSION +2. 
Replaces 'test-infra-ref: main' with 'test-infra-ref: release/VERSION' + +Usage: + python apply-release-changes.py VERSION + +Example: + python apply-release-changes.py 2.7 +""" + +import os +import pathlib +import sys +from typing import Optional + + +def replace_in_file(file_path: pathlib.Path, old_text: str, new_text: str) -> None: + """Replace all occurrences of old_text with new_text in the specified file.""" + try: + # Try reading the file without specifying encoding to use the default + encoding = None + try: + content = file_path.read_text() + except UnicodeDecodeError: + # If that fails, try with UTF-8 + encoding = "utf-8" + content = file_path.read_text(encoding=encoding) + + # Perform the replacement + new_content = content.replace(old_text, new_text) + + # Only write if changes were made + if new_content != content: + # Write with the same encoding we used to read + if encoding: + file_path.write_text(new_content, encoding=encoding) + else: + file_path.write_text(new_content) + print(f"Updated: {file_path}") + + except Exception as e: + print(f"Error processing {file_path}: {e}") + + +def find_repo_root() -> Optional[pathlib.Path]: + """Find the git repository root by searching for .git directory.""" + # Start from the current directory and traverse upwards + current_path = pathlib.Path.cwd().absolute() + + while current_path != current_path.parent: + # Check if .git directory exists + git_dir = current_path / ".git" + if git_dir.exists() and git_dir.is_dir(): + return current_path + + # Move up one directory + current_path = current_path.parent + + # If we get here, we didn't find a repository root + return None + + +def main() -> None: + # Check if version is provided as command line argument + if len(sys.argv) != 2: + print("Error: Exactly one version parameter is required") + print(f"Usage: python {os.path.basename(__file__)} VERSION") + print("Example: python apply-release-changes.py 2.7") + sys.exit(1) + + # Get version from command line argument + version = sys.argv[1] + print(f"Using release version: {version}") + + # Find the repository root by searching for .git directory + repo_root = find_repo_root() + if not repo_root: + print("Error: Not inside a git repository. 
Please run from within a git repository.") + sys.exit(1) + + print(f"Repository root found at: {repo_root}") + + # Get path to workflow directory + workflow_dir = repo_root / ".github" / "workflows" + + # Process all workflow files and perform both replacements on each file + for yml_file in workflow_dir.glob("*.yml"): + replace_in_file(yml_file, "@main", f"@release/{version}") + replace_in_file(yml_file, "test-infra-ref: main", f"test-infra-ref: release/{version}") + + +if __name__ == "__main__": + print("Starting YML updates...") + main() + print("YML updates completed.") diff --git a/scripts/download_model_urls.py b/scripts/download_model_urls.py new file mode 100644 index 00000000000..f5f53d71e98 --- /dev/null +++ b/scripts/download_model_urls.py @@ -0,0 +1,41 @@ +import asyncio +import sys +from pathlib import Path +from time import perf_counter +from urllib.parse import urlsplit + +import aiofiles +import aiohttp +from torchvision import models +from tqdm.asyncio import tqdm + + +async def main(download_root): + download_root.mkdir(parents=True, exist_ok=True) + urls = {weight.url for name in models.list_models() for weight in iter(models.get_model_weights(name))} + + async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=None)) as session: + await tqdm.gather(*[download(download_root, session, url) for url in urls]) + + +async def download(download_root, session, url): + response = await session.get(url, params=dict(source="ci")) + + assert response.ok + + file_name = Path(urlsplit(url).path).name + async with aiofiles.open(download_root / file_name, "wb") as f: + async for data in response.content.iter_any(): + await f.write(data) + + +if __name__ == "__main__": + download_root = ( + (Path(sys.argv[1]) if len(sys.argv) > 1 else Path("~/.cache/torch/hub/checkpoints")).expanduser().resolve() + ) + print(f"Downloading model weights to {download_root}") + start = perf_counter() + asyncio.get_event_loop().run_until_complete(main(download_root)) + stop = perf_counter() + minutes, seconds = divmod(stop - start, 60) + print(f"Download took {minutes:2.0f}m {seconds:2.0f}s") diff --git a/scripts/release_notes/classify_prs.py b/scripts/release_notes/classify_prs.py index de55c299381..5847c9f03f5 100644 --- a/scripts/release_notes/classify_prs.py +++ b/scripts/release_notes/classify_prs.py @@ -1,40 +1,22 @@ # In[1]: - -# imports and set configuration import pandas as pd -from retrieve_prs_data import run - -exclude_prototype = True -data_filename = "10.0_to_11.0-rc2.json" -previous_release = "v10.0" -current_release = "v11.0-rc2" # In[2]: - - +data_filename = "data.json" df = pd.read_json(data_filename).T df.tail() - # In[3]: - - all_labels = {lbl for labels in df["labels"] for lbl in labels} all_labels - # In[4]: - - # Add one column per label for label in all_labels: df[label] = df["labels"].apply(lambda labels_list: label in labels_list) df.head() - # In[5]: - - # Add a clean "module" column. It contains tuples since PRs can have more than one module. # Maybe we should include "topics" in that column as well? 
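For context on the notebook cells above: the release-notes helper expands the `labels` list of every PR into one boolean column per label, so each section of the notes can then be built by plain boolean filtering. A toy, self-contained version of that step on two made-up PR rows (the real frame is loaded from `data.json`):

```python
import pandas as pd

# Two made-up PR rows standing in for the contents of data.json.
df = pd.DataFrame(
    {
        "pr_number": [1234, 5678],
        "labels": [["enhancement", "module: transforms"], ["bug", "module: ops"]],
    }
)

# Same expansion as the notebook cell above: one boolean column per label.
all_labels = {lbl for labels in df["labels"] for lbl in labels}
for label in all_labels:
    df[label] = df["labels"].apply(lambda labels_list: label in labels_list)

# Sections such as "Improvements" or "Bug Fixes" then reduce to boolean filtering.
print(df[df["enhancement"]][["pr_number", "labels"]])
```

The "module" column built in the following cells is derived the same way from the `module:`-prefixed labels and kept as a tuple, since a single PR can touch several modules.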
@@ -51,24 +33,15 @@ df["module"] = df.module.apply(tuple) df.head() - # In[6]: - - mod_df = df.set_index("module").sort_index() mod_df.tail() - # In[7]: - - # All improvement PRs mod_df[mod_df["enhancement"]].head() - # In[8]: - - # improvement f module # note: don't filter module name on the index as the index contain tuples with non-exclusive values # Use the boolean column instead @@ -76,12 +49,10 @@ # In[9]: - - -def format_prs(mod_df): +def format_prs(mod_df, exclude_prototype=True): out = [] for idx, row in mod_df.iterrows(): - if exclude_prototype and row["prototype"]: + if exclude_prototype and "prototype" in row and row["prototype"]: continue modules = idx # Put "documentation" and "tests" first for sorting to be dece @@ -98,8 +69,6 @@ def format_prs(mod_df): # In[10]: - - included_prs = pd.DataFrame() # If labels are accurate, this should generate most of the release notes already @@ -112,27 +81,40 @@ def format_prs(mod_df): ("Bug Fixes", "bug"), ("Code Quality", "code quality"), ): - print(f"## {section_title}") - print() - tmp_df = mod_df[mod_df[module_idx]] - included_prs = pd.concat([included_prs, tmp_df]) - print(format_prs(tmp_df)) - print() + if module_idx in mod_df: + print(f"## {section_title}") + print() + tmp_df = mod_df[mod_df[module_idx]] + included_prs = pd.concat([included_prs, tmp_df]) + print(format_prs(tmp_df)) + print() # In[11]: - - # Missing PRs are these ones... classify them manually missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False) print(format_prs(missing_prs)) # In[12]: - # Generate list of contributors print() print("## Contributors") -command_to_run = f"{{ git shortlog -s {previous_release}..{current_release} | cut -f2- & git log -s {previous_release}..{current_release} | grep Co-authored | cut -f2- -d: | cut -f1 -d\\< | sed 's/^ *//;s/ *$//' ; }} | sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/, $//' | fold -s" -rc, output, err = run(command_to_run) -print(output) +previous_release = "c35d3855ccbfa6a36e6ae6337a1f2c721c1f1e78" +current_release = "5181a854d8b127cf465cd22a67c1b5aaf6ccae05" +print( + f"{{ git shortlog -s {previous_release}..{current_release} | cut -f2- & git log -s {previous_release}..{current_release} | grep Co-authored | cut -f2- -d: | cut -f1 -d\\< | sed 's/^ *//;s/ *//' ; }} | sort --ignore-case | uniq | tr '\\n' ';' | sed 's/;/, /g;s/,//' | fold -s" +) + +# In[13]: +# Utility to extract PR numbers only from multiple lines, useful to bundle all +# the docs changes for example: +import re + +s = """ + +[] Remove unnecessary dependency from macOS/Conda binaries (#8077) +[rocm] [ROCm] remove HCC references (#8070) +""" + +print(", ".join(re.findall("(#\\d+)", s))) diff --git a/setup.cfg b/setup.cfg index f36195194cd..0f4ddbfab10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ universal=1 [metadata] -license_file = LICENSE +license_files = LICENSE [pep8] max-line-length = 120 @@ -10,7 +10,7 @@ max-line-length = 120 [flake8] # note: we ignore all 501s (line too long) anyway as they're taken care of by black max-line-length = 120 -ignore = E203, E402, W503, W504, F821, E501 +ignore = E203, E402, W503, W504, F821, E501, B, C4, EXE per-file-ignores = __init__.py: F401, F403, F405 ./hubconf.py: F401 diff --git a/setup.py b/setup.py index 54319451521..5e69fa50f52 100644 --- a/setup.py +++ b/setup.py @@ -2,50 +2,85 @@ import distutils.spawn import glob import os +import shlex import shutil import subprocess import sys +import warnings +from pathlib import Path import torch from 
pkg_resources import DistributionNotFound, get_distribution, parse_version from setuptools import find_packages, setup -from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME, CUDAExtension +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDA_HOME, CUDAExtension, ROCM_HOME + +FORCE_CUDA = os.getenv("FORCE_CUDA", "0") == "1" +FORCE_MPS = os.getenv("FORCE_MPS", "0") == "1" +DEBUG = os.getenv("DEBUG", "0") == "1" +USE_PNG = os.getenv("TORCHVISION_USE_PNG", "1") == "1" +USE_JPEG = os.getenv("TORCHVISION_USE_JPEG", "1") == "1" +USE_WEBP = os.getenv("TORCHVISION_USE_WEBP", "1") == "1" +USE_NVJPEG = os.getenv("TORCHVISION_USE_NVJPEG", "1") == "1" +NVCC_FLAGS = os.getenv("NVCC_FLAGS", None) +# Note: the GPU video decoding stuff used to be called "video codec", which +# isn't an accurate or descriptive name considering there are at least 2 other +# video decoding backends in torchvision. I'm renaming this to "gpu video +# decoder" where possible, keeping user facing names (like the env var below) to +# the old scheme for BC. +USE_GPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1" +# Same here: "use ffmpeg" was used to denote "use cpu video decoder". +USE_CPU_VIDEO_DECODER = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1" + +TORCHVISION_INCLUDE = os.environ.get("TORCHVISION_INCLUDE", "") +TORCHVISION_LIBRARY = os.environ.get("TORCHVISION_LIBRARY", "") +TORCHVISION_INCLUDE = TORCHVISION_INCLUDE.split(os.pathsep) if TORCHVISION_INCLUDE else [] +TORCHVISION_LIBRARY = TORCHVISION_LIBRARY.split(os.pathsep) if TORCHVISION_LIBRARY else [] + +ROOT_DIR = Path(__file__).absolute().parent +CSRS_DIR = ROOT_DIR / "torchvision/csrc" +IS_ROCM = (torch.version.hip is not None) and (ROCM_HOME is not None) +BUILD_CUDA_SOURCES = (torch.cuda.is_available() and ((CUDA_HOME is not None) or IS_ROCM)) or FORCE_CUDA + +package_name = os.getenv("TORCHVISION_PACKAGE_NAME", "torchvision") + +print("Torchvision build configuration:") +print(f"{FORCE_CUDA = }") +print(f"{FORCE_MPS = }") +print(f"{DEBUG = }") +print(f"{USE_PNG = }") +print(f"{USE_JPEG = }") +print(f"{USE_WEBP = }") +print(f"{USE_NVJPEG = }") +print(f"{NVCC_FLAGS = }") +print(f"{USE_CPU_VIDEO_DECODER = }") +print(f"{USE_GPU_VIDEO_DECODER = }") +print(f"{TORCHVISION_INCLUDE = }") +print(f"{TORCHVISION_LIBRARY = }") +print(f"{IS_ROCM = }") +print(f"{BUILD_CUDA_SOURCES = }") + + +def get_version(): + with open(ROOT_DIR / "version.txt") as f: + version = f.readline().strip() + sha = "Unknown" - -def read(*names, **kwargs): - with open(os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")) as fp: - return fp.read() - - -def get_dist(pkgname): try: - return get_distribution(pkgname) - except DistributionNotFound: - return None - - -cwd = os.path.dirname(os.path.abspath(__file__)) + sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=str(ROOT_DIR)).decode("ascii").strip() + except Exception: + pass -version_txt = os.path.join(cwd, "version.txt") -with open(version_txt) as f: - version = f.readline().strip() -sha = "Unknown" -package_name = "torchvision" + if os.getenv("BUILD_VERSION"): + version = os.getenv("BUILD_VERSION") + elif sha != "Unknown": + version += "+" + sha[:7] -try: - sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd).decode("ascii").strip() -except Exception: - pass + return version, sha -if os.getenv("BUILD_VERSION"): - version = os.getenv("BUILD_VERSION") -elif sha != "Unknown": - version += "+" + sha[:7] - -def 
write_version_file(): - version_path = os.path.join(cwd, "torchvision", "version.py") - with open(version_path, "w") as f: +def write_version_file(version, sha): + # Exists for BC, probably completely useless. + with open(ROOT_DIR / "torchvision" / "version.py", "w") as f: f.write(f"__version__ = '{version}'\n") f.write(f"git_version = {repr(sha)}\n") f.write("from torchvision.extension import _check_cuda_version\n") @@ -53,178 +88,56 @@ def write_version_file(): f.write(" cuda = _check_cuda_version()\n") -pytorch_dep = "torch" -if os.getenv("PYTORCH_VERSION"): - pytorch_dep += "==" + os.getenv("PYTORCH_VERSION") - -requirements = [ - "typing_extensions", - "numpy", - "requests", - pytorch_dep, -] - -# Excluding 8.3.* because of https://github.com/pytorch/vision/issues/4934 -pillow_ver = " >= 5.3.0, !=8.3.*" -pillow_req = "pillow-simd" if get_dist("pillow-simd") is not None else "pillow" -requirements.append(pillow_req + pillow_ver) - - -def find_library(name, vision_include): - this_dir = os.path.dirname(os.path.abspath(__file__)) - build_prefix = os.environ.get("BUILD_PREFIX", None) - is_conda_build = build_prefix is not None - - library_found = False - conda_installed = False - lib_folder = None - include_folder = None - library_header = f"{name}.h" - - # Lookup in TORCHVISION_INCLUDE or in the package file - package_path = [os.path.join(this_dir, "torchvision")] - for folder in vision_include + package_path: - candidate_path = os.path.join(folder, library_header) - library_found = os.path.exists(candidate_path) - if library_found: - break - - if not library_found: - print(f"Running build on conda-build: {is_conda_build}") - if is_conda_build: - # Add conda headers/libraries - if os.name == "nt": - build_prefix = os.path.join(build_prefix, "Library") - include_folder = os.path.join(build_prefix, "include") - lib_folder = os.path.join(build_prefix, "lib") - library_header_path = os.path.join(include_folder, library_header) - library_found = os.path.isfile(library_header_path) - conda_installed = library_found - else: - # Check if using Anaconda to produce wheels - conda = shutil.which("conda") - is_conda = conda is not None - print(f"Running build on conda: {is_conda}") - if is_conda: - python_executable = sys.executable - py_folder = os.path.dirname(python_executable) - if os.name == "nt": - env_path = os.path.join(py_folder, "Library") - else: - env_path = os.path.dirname(py_folder) - lib_folder = os.path.join(env_path, "lib") - include_folder = os.path.join(env_path, "include") - library_header_path = os.path.join(include_folder, library_header) - library_found = os.path.isfile(library_header_path) - conda_installed = library_found - - if not library_found: - if sys.platform == "linux": - library_found = os.path.exists(f"/usr/include/{library_header}") - library_found = library_found or os.path.exists(f"/usr/local/include/{library_header}") - - return library_found, conda_installed, include_folder, lib_folder - - -def get_extensions(): - this_dir = os.path.dirname(os.path.abspath(__file__)) - extensions_dir = os.path.join(this_dir, "torchvision", "csrc") - - main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + glob.glob( - os.path.join(extensions_dir, "ops", "*.cpp") - ) - source_cpu = ( - glob.glob(os.path.join(extensions_dir, "ops", "autograd", "*.cpp")) - + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) - + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp")) - ) - - print("Compiling extensions with following flags:") - 
compile_cpp_tests = os.getenv("WITH_CPP_MODELS_TEST", "0") == "1" - print(f" WITH_CPP_MODELS_TEST: {compile_cpp_tests}") - force_cuda = os.getenv("FORCE_CUDA", "0") == "1" - print(f" FORCE_CUDA: {force_cuda}") - debug_mode = os.getenv("DEBUG", "0") == "1" - print(f" DEBUG: {debug_mode}") - use_png = os.getenv("TORCHVISION_USE_PNG", "1") == "1" - print(f" TORCHVISION_USE_PNG: {use_png}") - use_jpeg = os.getenv("TORCHVISION_USE_JPEG", "1") == "1" - print(f" TORCHVISION_USE_JPEG: {use_jpeg}") - use_nvjpeg = os.getenv("TORCHVISION_USE_NVJPEG", "1") == "1" - print(f" TORCHVISION_USE_NVJPEG: {use_nvjpeg}") - use_ffmpeg = os.getenv("TORCHVISION_USE_FFMPEG", "1") == "1" - print(f" TORCHVISION_USE_FFMPEG: {use_ffmpeg}") - use_video_codec = os.getenv("TORCHVISION_USE_VIDEO_CODEC", "1") == "1" - print(f" TORCHVISION_USE_VIDEO_CODEC: {use_video_codec}") - - nvcc_flags = os.getenv("NVCC_FLAGS", "") - print(f" NVCC_FLAGS: {nvcc_flags}") - - is_rocm_pytorch = False - - if torch.__version__ >= "1.5": - from torch.utils.cpp_extension import ROCM_HOME - - is_rocm_pytorch = (torch.version.hip is not None) and (ROCM_HOME is not None) - - if is_rocm_pytorch: - from torch.utils.hipify import hipify_python - - hipify_python.hipify( - project_directory=this_dir, - output_directory=this_dir, - includes="torchvision/csrc/ops/cuda/*", - show_detailed=True, - is_pytorch_extension=True, - ) - source_cuda = glob.glob(os.path.join(extensions_dir, "ops", "hip", "*.hip")) - # Copy over additional files - for file in glob.glob(r"torchvision/csrc/ops/cuda/*.h"): - shutil.copy(file, "torchvision/csrc/ops/hip") - else: - source_cuda = glob.glob(os.path.join(extensions_dir, "ops", "cuda", "*.cu")) - - source_cuda += glob.glob(os.path.join(extensions_dir, "ops", "autocast", "*.cpp")) +def get_requirements(): + def get_dist(pkgname): + try: + return get_distribution(pkgname) + except DistributionNotFound: + return None + + pytorch_dep = os.getenv("TORCH_PACKAGE_NAME", "torch") + if version_pin := os.getenv("PYTORCH_VERSION"): + pytorch_dep += "==" + version_pin + elif (version_pin_ge := os.getenv("PYTORCH_VERSION_GE")) and (version_pin_lt := os.getenv("PYTORCH_VERSION_LT")): + # This branch and the associated env vars exist to help third-party + # builds like in https://github.com/pytorch/vision/pull/8936. This is + # supported on a best-effort basis, we don't guarantee that this won't + # eventually break (and we don't test it.) 
+ pytorch_dep += f">={version_pin_ge},<{version_pin_lt}" + + requirements = [ + "numpy", + pytorch_dep, + ] - sources = main_file + source_cpu - extension = CppExtension + # Excluding 8.3.* because of https://github.com/pytorch/vision/issues/4934 + pillow_ver = " >= 5.3.0, !=8.3.*" + pillow_req = "pillow-simd" if get_dist("pillow-simd") is not None else "pillow" + requirements.append(pillow_req + pillow_ver) - if compile_cpp_tests: - print("Compiling CPP tests") - test_dir = os.path.join(this_dir, "test") - models_dir = os.path.join(this_dir, "torchvision", "csrc", "models") - test_file = glob.glob(os.path.join(test_dir, "*.cpp")) - source_models = glob.glob(os.path.join(models_dir, "*.cpp")) + return requirements - test_file = [os.path.join(test_dir, s) for s in test_file] - source_models = [os.path.join(models_dir, s) for s in source_models] - tests = test_file + source_models - tests_include_dirs = [test_dir, models_dir] +def get_macros_and_flags(): define_macros = [] - extra_compile_args = {"cxx": []} - if (torch.cuda.is_available() and ((CUDA_HOME is not None) or is_rocm_pytorch)) or force_cuda: - extension = CUDAExtension - sources += source_cuda - if not is_rocm_pytorch: + if BUILD_CUDA_SOURCES: + if IS_ROCM: + define_macros += [("WITH_HIP", None)] + nvcc_flags = [] + else: define_macros += [("WITH_CUDA", None)] - if nvcc_flags == "": + if NVCC_FLAGS is None: nvcc_flags = [] else: - nvcc_flags = nvcc_flags.split(" ") - else: - define_macros += [("WITH_HIP", None)] - nvcc_flags = [] + nvcc_flags = shlex.split(NVCC_FLAGS) extra_compile_args["nvcc"] = nvcc_flags if sys.platform == "win32": define_macros += [("torchvision_EXPORTS", None)] - define_macros += [("USE_PYTHON", None)] extra_compile_args["cxx"].append("/MP") - if debug_mode: - print("Compiling in debug mode") + if DEBUG: extra_compile_args["cxx"].append("-g") extra_compile_args["cxx"].append("-O0") if "nvcc" in extra_compile_args: @@ -233,162 +146,249 @@ def get_extensions(): extra_compile_args["nvcc"] = [f for f in nvcc_flags if not ("-O" in f or "-g" in f)] extra_compile_args["nvcc"].append("-O0") extra_compile_args["nvcc"].append("-g") + else: + extra_compile_args["cxx"].append("-g0") - sources = [os.path.join(extensions_dir, s) for s in sources] + return define_macros, extra_compile_args - include_dirs = [extensions_dir] - ext_modules = [ - extension( - "torchvision._C", - sorted(sources), - include_dirs=include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_args, - ) - ] - if compile_cpp_tests: - ext_modules.append( - extension( - "torchvision._C_tests", - tests, - include_dirs=tests_include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_args, - ) - ) +def make_C_extension(): + print("Building _C extension") - # ------------------- Torchvision extra extensions ------------------------ - vision_include = os.environ.get("TORCHVISION_INCLUDE", None) - vision_library = os.environ.get("TORCHVISION_LIBRARY", None) - vision_include = vision_include.split(os.pathsep) if vision_include is not None else [] - vision_library = vision_library.split(os.pathsep) if vision_library is not None else [] - include_dirs += vision_include - library_dirs = vision_library + sources = ( + list(CSRS_DIR.glob("*.cpp")) + + list(CSRS_DIR.glob("ops/*.cpp")) + + list(CSRS_DIR.glob("ops/autocast/*.cpp")) + + list(CSRS_DIR.glob("ops/autograd/*.cpp")) + + list(CSRS_DIR.glob("ops/cpu/*.cpp")) + + list(CSRS_DIR.glob("ops/quantized/cpu/*.cpp")) + ) + mps_sources = list(CSRS_DIR.glob("ops/mps/*.mm")) - # 
Image reading extension - image_macros = [] - image_include = [extensions_dir] - image_library = [] - image_link_flags = [] + if IS_ROCM: + from torch.utils.hipify import hipify_python - if sys.platform == "win32": - image_macros += [("USE_PYTHON", None)] - - # Locating libPNG - libpng = shutil.which("libpng-config") - pngfix = shutil.which("pngfix") - png_found = libpng is not None or pngfix is not None - - use_png = use_png and png_found - if use_png: - print("Found PNG library") - if libpng is not None: - # Linux / Mac - min_version = "1.6.0" - png_version = subprocess.run([libpng, "--version"], stdout=subprocess.PIPE) - png_version = png_version.stdout.strip().decode("utf-8") - png_version = parse_version(png_version) - if png_version >= parse_version(min_version): - print("Building torchvision with PNG image support") - png_lib = subprocess.run([libpng, "--libdir"], stdout=subprocess.PIPE) - png_lib = png_lib.stdout.strip().decode("utf-8") - if "disabled" not in png_lib: - image_library += [png_lib] - png_include = subprocess.run([libpng, "--I_opts"], stdout=subprocess.PIPE) - png_include = png_include.stdout.strip().decode("utf-8") - _, png_include = png_include.split("-I") - image_include += [png_include] - image_link_flags.append("png") - print(f" libpng version: {png_version}") - print(f" libpng include path: {png_include}") - else: - print("Could not add PNG image support to torchvision:") - print(f" libpng minimum version {min_version}, found {png_version}") - use_png = False - else: - # Windows - png_lib = os.path.join(os.path.dirname(os.path.dirname(pngfix)), "lib") - png_include = os.path.join(os.path.dirname(os.path.dirname(pngfix)), "include", "libpng16") - image_library += [png_lib] - image_include += [png_include] - image_link_flags.append("libpng") + hipify_python.hipify( + project_directory=str(ROOT_DIR), + output_directory=str(ROOT_DIR), + includes="torchvision/csrc/ops/cuda/*", + show_detailed=True, + is_pytorch_extension=True, + ) + cuda_sources = list(CSRS_DIR.glob("ops/hip/*.hip")) + for header in CSRS_DIR.glob("ops/cuda/*.h"): + shutil.copy(str(header), str(CSRS_DIR / "ops/hip")) else: - print("Building torchvision without PNG image support") - image_macros += [("PNG_FOUND", str(int(use_png)))] - - # Locating libjpeg - (jpeg_found, jpeg_conda, jpeg_include, jpeg_lib) = find_library("jpeglib", vision_include) - - use_jpeg = use_jpeg and jpeg_found - if use_jpeg: - print("Building torchvision with JPEG image support") - image_link_flags.append("jpeg") - if jpeg_conda: - image_library += [jpeg_lib] - image_include += [jpeg_include] + cuda_sources = list(CSRS_DIR.glob("ops/cuda/*.cu")) + + if BUILD_CUDA_SOURCES: + Extension = CUDAExtension + sources += cuda_sources else: - print("Building torchvision without JPEG image support") - image_macros += [("JPEG_FOUND", str(int(use_jpeg)))] - - # Locating nvjpeg - # Should be included in CUDA_HOME for CUDA >= 10.1, which is the minimum version we have in the CI - nvjpeg_found = ( - extension is CUDAExtension - and CUDA_HOME is not None - and os.path.exists(os.path.join(CUDA_HOME, "include", "nvjpeg.h")) + Extension = CppExtension + if torch.backends.mps.is_available() or FORCE_MPS: + sources += mps_sources + + define_macros, extra_compile_args = get_macros_and_flags() + return Extension( + name="torchvision._C", + sources=sorted(str(s) for s in sources), + include_dirs=[CSRS_DIR], + define_macros=define_macros, + extra_compile_args=extra_compile_args, ) - use_nvjpeg = use_nvjpeg and nvjpeg_found - if use_nvjpeg: - 
print("Building torchvision with NVJPEG image support") - image_link_flags.append("nvjpeg") + +def find_libpng(): + # Returns (found, include dir, library dir, library name) + if sys.platform in ("linux", "darwin", "aix"): + libpng_config = shutil.which("libpng-config") + if libpng_config is None: + warnings.warn("libpng-config not found") + return False, None, None, None + min_version = parse_version("1.6.0") + png_version = parse_version( + subprocess.run([libpng_config, "--version"], stdout=subprocess.PIPE).stdout.strip().decode("utf-8") + ) + if png_version < min_version: + warnings.warn(f"libpng version {png_version} is less than minimum required version {min_version}") + return False, None, None, None + + include_dir = ( + subprocess.run([libpng_config, "--I_opts"], stdout=subprocess.PIPE) + .stdout.strip() + .decode("utf-8") + .split("-I")[1] + ) + library_dir = subprocess.run([libpng_config, "--libdir"], stdout=subprocess.PIPE).stdout.strip().decode("utf-8") + library = "png" + else: # Windows + pngfix = shutil.which("pngfix") + if pngfix is None: + warnings.warn("pngfix not found") + return False, None, None, None + pngfix_dir = Path(pngfix).absolute().parent.parent + + library_dir = str(pngfix_dir / "lib") + include_dir = str(pngfix_dir / "include/libpng16") + library = "libpng" + + return True, include_dir, library_dir, library + + +def find_library(header): + # returns (found, include dir, library dir) + # if include dir or library dir is None, it means that the library is in + # standard paths and don't need to be added to compiler / linker search + # paths + + searching_for = f"Searching for {header}" + + for folder in TORCHVISION_INCLUDE: + if (Path(folder) / header).exists(): + print(f"{searching_for} in {Path(folder) / header}. Found in TORCHVISION_INCLUDE.") + return True, None, None + print(f"{searching_for}. Didn't find in TORCHVISION_INCLUDE.") + + # Try conda-related prefixes. If BUILD_PREFIX is set it means conda-build is + # being run. If CONDA_PREFIX is set then we're in a conda environment. + for prefix_env_var in ("BUILD_PREFIX", "CONDA_PREFIX"): + if (prefix := os.environ.get(prefix_env_var)) is not None: + prefix = Path(prefix) + if sys.platform == "win32": + prefix = prefix / "Library" + include_dir = prefix / "include" + library_dir = prefix / "lib" + if (include_dir / header).exists(): + print(f"{searching_for}. Found in {prefix_env_var}.") + return True, str(include_dir), str(library_dir) + print(f"{searching_for}. Didn't find in {prefix_env_var}.") + + if sys.platform == "linux": + for prefix in (Path("/usr/include"), Path("/usr/local/include")): + if (prefix / header).exists(): + print(f"{searching_for}. Found in {prefix}.") + return True, None, None + print(f"{searching_for}. Didn't find in {prefix}") + + if sys.platform == "darwin": + HOMEBREW_PATH = Path("/opt/homebrew") + include_dir = HOMEBREW_PATH / "include" + library_dir = HOMEBREW_PATH / "lib" + if (include_dir / header).exists(): + print(f"{searching_for}. 
Found in {include_dir}.") + return True, str(include_dir), str(library_dir) + + return False, None, None + + +def make_image_extension(): + print("Building image extension") + + include_dirs = TORCHVISION_INCLUDE.copy() + library_dirs = TORCHVISION_LIBRARY.copy() + + libraries = [] + define_macros, extra_compile_args = get_macros_and_flags() + + image_dir = CSRS_DIR / "io/image" + sources = list(image_dir.glob("*.cpp")) + list(image_dir.glob("cpu/*.cpp")) + list(image_dir.glob("cpu/giflib/*.c")) + + if IS_ROCM: + sources += list(image_dir.glob("hip/*.cpp")) + # we need to exclude this in favor of the hipified source + sources.remove(image_dir / "image.cpp") else: - print("Building torchvision without NVJPEG image support") - image_macros += [("NVJPEG_FOUND", str(int(use_nvjpeg)))] - - image_path = os.path.join(extensions_dir, "io", "image") - image_src = ( - glob.glob(os.path.join(image_path, "*.cpp")) - + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) - + glob.glob(os.path.join(image_path, "cuda", "*.cpp")) + sources += list(image_dir.glob("cuda/*.cpp")) + + Extension = CppExtension + + if USE_PNG: + png_found, png_include_dir, png_library_dir, png_library = find_libpng() + if png_found: + print("Building torchvision with PNG support") + print(f"{png_include_dir = }") + print(f"{png_library_dir = }") + include_dirs.append(png_include_dir) + library_dirs.append(png_library_dir) + libraries.append(png_library) + define_macros += [("PNG_FOUND", 1)] + else: + warnings.warn("Building torchvision without PNG support") + + if USE_JPEG: + jpeg_found, jpeg_include_dir, jpeg_library_dir = find_library(header="jpeglib.h") + if jpeg_found: + print("Building torchvision with JPEG support") + print(f"{jpeg_include_dir = }") + print(f"{jpeg_library_dir = }") + if jpeg_include_dir is not None and jpeg_library_dir is not None: + # if those are None it means they come from standard paths that are already in the search paths, which we don't need to re-add. + include_dirs.append(jpeg_include_dir) + library_dirs.append(jpeg_library_dir) + libraries.append("jpeg") + define_macros += [("JPEG_FOUND", 1)] + else: + warnings.warn("Building torchvision without JPEG support") + + if USE_WEBP: + webp_found, webp_include_dir, webp_library_dir = find_library(header="webp/decode.h") + if webp_found: + print("Building torchvision with WEBP support") + print(f"{webp_include_dir = }") + print(f"{webp_library_dir = }") + if webp_include_dir is not None and webp_library_dir is not None: + # if those are None it means they come from standard paths that are already in the search paths, which we don't need to re-add. 
+ include_dirs.append(webp_include_dir) + library_dirs.append(webp_library_dir) + webp_library = "libwebp" if sys.platform == "win32" else "webp" + libraries.append(webp_library) + define_macros += [("WEBP_FOUND", 1)] + else: + warnings.warn("Building torchvision without WEBP support") + + if USE_NVJPEG and (torch.cuda.is_available() or FORCE_CUDA): + nvjpeg_found = CUDA_HOME is not None and (Path(CUDA_HOME) / "include/nvjpeg.h").exists() + + if nvjpeg_found: + print("Building torchvision with NVJPEG image support") + libraries.append("nvjpeg") + define_macros += [("NVJPEG_FOUND", 1)] + Extension = CUDAExtension + else: + warnings.warn("Building torchvision without NVJPEG support") + elif USE_NVJPEG: + warnings.warn("Building torchvision without NVJPEG support") + + return Extension( + name="torchvision.image", + sources=sorted(str(s) for s in sources), + include_dirs=include_dirs, + library_dirs=library_dirs, + define_macros=define_macros, + libraries=libraries, + extra_compile_args=extra_compile_args, ) - if use_png or use_jpeg: - ext_modules.append( - extension( - "torchvision.image", - image_src, - include_dirs=image_include + include_dirs + [image_path], - library_dirs=image_library + library_dirs, - define_macros=image_macros, - libraries=image_link_flags, - extra_compile_args=extra_compile_args, - ) - ) - # Locating ffmpeg - ffmpeg_exe = shutil.which("ffmpeg") - has_ffmpeg = ffmpeg_exe is not None - ffmpeg_version = None - # FIXME: Building torchvision with ffmpeg on MacOS or with Python 3.9 - # FIXME: causes crash. See the following GitHub issues for more details. - # FIXME: https://github.com/pytorch/pytorch/issues/65000 - # FIXME: https://github.com/pytorch/vision/issues/3367 +def make_video_decoders_extensions(): + print("Building video decoder extensions") + + build_without_extensions_msg = "Building without video decoders extensions." if sys.platform != "linux" or (sys.version_info.major == 3 and sys.version_info.minor == 9): - has_ffmpeg = False - if has_ffmpeg: - try: - # This is to check if ffmpeg is installed properly. - ffmpeg_version = subprocess.check_output(["ffmpeg", "-version"]) - except subprocess.CalledProcessError: - print("Building torchvision without ffmpeg support") - print(" Error fetching ffmpeg version, ignoring ffmpeg.") - has_ffmpeg = False + # FIXME: Building torchvision with ffmpeg on MacOS or with Python 3.9 + # FIXME: causes crash. See the following GitHub issues for more details. 
+ # FIXME: https://github.com/pytorch/pytorch/issues/65000 + # FIXME: https://github.com/pytorch/vision/issues/3367 + print("Can only build video decoder extensions on linux and Python != 3.9") + return [] - use_ffmpeg = use_ffmpeg and has_ffmpeg + ffmpeg_exe = shutil.which("ffmpeg") + if ffmpeg_exe is None: + print(f"{build_without_extensions_msg} Couldn't find ffmpeg binary.") + return [] - if use_ffmpeg: + def find_ffmpeg_libraries(): ffmpeg_libraries = {"libavcodec", "libavformat", "libavutil", "libswresample", "libswscale"} ffmpeg_bin = os.path.dirname(ffmpeg_exe) @@ -410,49 +410,56 @@ def get_extensions(): for library in ffmpeg_libraries: library_found = False - for search_path in ffmpeg_include_dir + include_dirs: + for search_path in ffmpeg_include_dir + TORCHVISION_INCLUDE: full_path = os.path.join(search_path, library, "*.h") library_found |= len(glob.glob(full_path)) > 0 if not library_found: - print("Building torchvision without ffmpeg support") - print(f" {library} header files were not found, disabling ffmpeg support") - use_ffmpeg = False - else: - print("Building torchvision without ffmpeg support") + print(f"{build_without_extensions_msg}") + print(f"{library} header files were not found.") + return None, None + + return ffmpeg_include_dir, ffmpeg_library_dir + + ffmpeg_include_dir, ffmpeg_library_dir = find_ffmpeg_libraries() + if ffmpeg_include_dir is None or ffmpeg_library_dir is None: + return [] + + print("Found ffmpeg:") + print(f" ffmpeg include path: {ffmpeg_include_dir}") + print(f" ffmpeg library_dir: {ffmpeg_library_dir}") - if use_ffmpeg: - print("Building torchvision with ffmpeg support") - print(f" ffmpeg version: {ffmpeg_version}") - print(f" ffmpeg include path: {ffmpeg_include_dir}") - print(f" ffmpeg library_dir: {ffmpeg_library_dir}") + extensions = [] + if USE_CPU_VIDEO_DECODER: + print("Building with CPU video decoder support") # TorchVision base decoder + video reader - video_reader_src_dir = os.path.join(this_dir, "torchvision", "csrc", "io", "video_reader") + video_reader_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video_reader") video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - base_decoder_src_dir = os.path.join(this_dir, "torchvision", "csrc", "io", "decoder") + base_decoder_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "decoder") base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "*.cpp")) # Torchvision video API - videoapi_src_dir = os.path.join(this_dir, "torchvision", "csrc", "io", "video") + videoapi_src_dir = os.path.join(ROOT_DIR, "torchvision", "csrc", "io", "video") videoapi_src = glob.glob(os.path.join(videoapi_src_dir, "*.cpp")) # exclude tests base_decoder_src = [x for x in base_decoder_src if "_test.cpp" not in x] combined_src = video_reader_src + base_decoder_src + videoapi_src - ext_modules.append( + extensions.append( CppExtension( + # This is an awful name. It should be "cpu_video_decoder". Keeping for BC. 
"torchvision.video_reader", combined_src, include_dirs=[ base_decoder_src_dir, video_reader_src_dir, videoapi_src_dir, - extensions_dir, + str(CSRS_DIR), *ffmpeg_include_dir, - *include_dirs, + *TORCHVISION_INCLUDE, ], - library_dirs=ffmpeg_library_dir + library_dirs, + library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY, libraries=[ "avcodec", "avformat", @@ -460,41 +467,41 @@ def get_extensions(): "swresample", "swscale", ], - extra_compile_args=["-std=c++14"] if os.name != "nt" else ["/std:c++14", "/MP"], - extra_link_args=["-std=c++14" if os.name != "nt" else "/std:c++14"], + extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"], + extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"], ) ) - # Locating video codec - # CUDA_HOME should be set to the cuda root directory. - # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the location to - # video codec header files and libraries respectively. - video_codec_found = ( - extension is CUDAExtension - and CUDA_HOME is not None - and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in vision_include]) - and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in vision_include]) - and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in library_dirs]) - ) + if USE_GPU_VIDEO_DECODER: + # Locating GPU video decoder headers and libraries + # CUDA_HOME should be set to the cuda root directory. + # TORCHVISION_INCLUDE and TORCHVISION_LIBRARY should include the locations + # to the headers and libraries below + if not ( + BUILD_CUDA_SOURCES + and CUDA_HOME is not None + and any([os.path.exists(os.path.join(folder, "cuviddec.h")) for folder in TORCHVISION_INCLUDE]) + and any([os.path.exists(os.path.join(folder, "nvcuvid.h")) for folder in TORCHVISION_INCLUDE]) + and any([os.path.exists(os.path.join(folder, "libnvcuvid.so")) for folder in TORCHVISION_LIBRARY]) + and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) + ): + print("Could not find necessary dependencies. 
Refer to setup.py to check which ones are needed.") + print("Building without GPU video decoder support") + return extensions + print("Building torchvision with GPU video decoder support") - use_video_codec = use_video_codec and video_codec_found - if ( - use_video_codec - and use_ffmpeg - and any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) - ): - print("Building torchvision with video codec support") - gpu_decoder_path = os.path.join(extensions_dir, "io", "decoder", "gpu") + gpu_decoder_path = os.path.join(CSRS_DIR, "io", "decoder", "gpu") gpu_decoder_src = glob.glob(os.path.join(gpu_decoder_path, "*.cpp")) cuda_libs = os.path.join(CUDA_HOME, "lib64") cuda_inc = os.path.join(CUDA_HOME, "include") - ext_modules.append( - extension( - "torchvision.Decoder", + _, extra_compile_args = get_macros_and_flags() + extensions.append( + CUDAExtension( + "torchvision.gpu_decoder", gpu_decoder_src, - include_dirs=include_dirs + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir, - library_dirs=ffmpeg_library_dir + library_dirs + [cuda_libs], + include_dirs=[CSRS_DIR] + TORCHVISION_INCLUDE + [gpu_decoder_path] + [cuda_inc] + ffmpeg_include_dir, + library_dirs=ffmpeg_library_dir + TORCHVISION_LIBRARY + [cuda_libs], libraries=[ "avcodec", "avformat", @@ -512,20 +519,8 @@ def get_extensions(): extra_compile_args=extra_compile_args, ) ) - else: - print("Building torchvision without video codec support") - if ( - use_video_codec - and use_ffmpeg - and not any([os.path.exists(os.path.join(folder, "libavcodec", "bsf.h")) for folder in ffmpeg_include_dir]) - ): - print( - " The installed version of ffmpeg is missing the header file 'bsf.h' which is " - " required for GPU video decoding. Please install the latest ffmpeg from conda-forge channel:" - " `conda install -c conda-forge ffmpeg`." 
- ) - return ext_modules + return extensions class clean(distutils.command.clean.clean): @@ -544,15 +539,21 @@ def run(self): if __name__ == "__main__": - print(f"Building wheel {package_name}-{version}") + version, sha = get_version() + write_version_file(version, sha) - write_version_file() + print(f"Building wheel {package_name}-{version}") - with open("README.rst") as f: + with open("README.md") as f: readme = f.read() + extensions = [ + make_C_extension(), + make_image_extension(), + *make_video_decoders_extensions(), + ] + setup( - # Metadata name=package_name, version=version, author="PyTorch Core Team", @@ -560,17 +561,18 @@ def run(self): url="https://github.com/pytorch/vision", description="image and video datasets and models for torch deep learning", long_description=readme, + long_description_content_type="text/markdown", license="BSD", - # Package info packages=find_packages(exclude=("test",)), package_data={package_name: ["*.dll", "*.dylib", "*.so", "prototype/datasets/_builtin/*.categories"]}, zip_safe=False, - install_requires=requirements, + install_requires=get_requirements(), extras_require={ + "gdown": ["gdown>=4.7.3"], "scipy": ["scipy"], }, - ext_modules=get_extensions(), - python_requires=">=3.7", + ext_modules=extensions, + python_requires=">=3.9", cmdclass={ "build_ext": BuildExtension.with_options(no_python_abi_suffix=True), "clean": clean, diff --git a/test/assets/fakedata/draw_boxes_different_label_colors.png b/test/assets/fakedata/draw_boxes_different_label_colors.png new file mode 100644 index 00000000000..72178930602 Binary files /dev/null and b/test/assets/fakedata/draw_boxes_different_label_colors.png differ diff --git a/test/assets/fakedata/draw_boxes_different_label_fill_colors.png b/test/assets/fakedata/draw_boxes_different_label_fill_colors.png new file mode 100644 index 00000000000..b15adb40092 Binary files /dev/null and b/test/assets/fakedata/draw_boxes_different_label_fill_colors.png differ diff --git a/test/assets/fakedata/draw_boxes_util.png b/test/assets/fakedata/draw_boxes_util.png index d38f8be78ac..ee5dac329e0 100644 Binary files a/test/assets/fakedata/draw_boxes_util.png and b/test/assets/fakedata/draw_boxes_util.png differ diff --git a/test/assets/fakedata/draw_keypoints_visibility.png b/test/assets/fakedata/draw_keypoints_visibility.png new file mode 100644 index 00000000000..8cd34f84539 Binary files /dev/null and b/test/assets/fakedata/draw_keypoints_visibility.png differ diff --git a/test/assets/fakedata/draw_rotated_boxes.png b/test/assets/fakedata/draw_rotated_boxes.png new file mode 100644 index 00000000000..4e5a5eb5414 Binary files /dev/null and b/test/assets/fakedata/draw_rotated_boxes.png differ diff --git a/test/assets/fakedata/draw_rotated_boxes_fill.png b/test/assets/fakedata/draw_rotated_boxes_fill.png new file mode 100644 index 00000000000..474b771f04e Binary files /dev/null and b/test/assets/fakedata/draw_rotated_boxes_fill.png differ diff --git a/test/assets/fakedata/logos/rgb_pytorch.avif b/test/assets/fakedata/logos/rgb_pytorch.avif new file mode 100644 index 00000000000..ea1bb586957 Binary files /dev/null and b/test/assets/fakedata/logos/rgb_pytorch.avif differ diff --git a/test/assets/fakedata/logos/rgb_pytorch.webp b/test/assets/fakedata/logos/rgb_pytorch.webp new file mode 100644 index 00000000000..e594584d76d Binary files /dev/null and b/test/assets/fakedata/logos/rgb_pytorch.webp differ diff --git a/test/assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic 
b/test/assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic new file mode 100644 index 00000000000..4c29ac3c71c Binary files /dev/null and b/test/assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic differ diff --git a/test/assets/toosmall_png/heapbof.png b/test/assets/toosmall_png/heapbof.png new file mode 100644 index 00000000000..e720d183342 Binary files /dev/null and b/test/assets/toosmall_png/heapbof.png differ diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py index 8c5484a2823..ef5d5e1ec96 100644 --- a/test/builtin_dataset_mocks.py +++ b/test/builtin_dataset_mocks.py @@ -12,14 +12,14 @@ import random import shutil import unittest.mock -import warnings import xml.etree.ElementTree as ET from collections import Counter, defaultdict import numpy as np import pytest import torch -from datasets_utils import combinations_grid, create_image_file, create_image_folder, make_tar, make_zip +from common_utils import combinations_grid +from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip from torch.nn.functional import one_hot from torch.testing import make_tensor as _make_tensor from torchvision.prototype import datasets @@ -519,10 +519,22 @@ def imagenet(root, config): ] num_children = 1 synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5)) - with warnings.catch_warnings(): - # The warning is not for savemat, but rather for some internals savemet is using - warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning) - savemat(data_root / "meta.mat", dict(synsets=synsets)) + synsets = np.array( + synsets, + dtype=np.dtype( + [ + ("ILSVRC2012_ID", "O"), + ("WNID", "O"), + ("words", "O"), + ("gloss", "O"), + ("num_children", "O"), + ("children", "O"), + ("wordnet_height", "O"), + ("num_train_images", "O"), + ] + ), + ) + savemat(data_root / "meta.mat", dict(synsets=synsets)) make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz") else: # config["split"] == "test" @@ -661,15 +673,15 @@ class SBDMockData: _NUM_CATEGORIES = 20 @classmethod - def _make_split_files(cls, root_map): - ids_map = { - split: [f"2008_{idx:06d}" for idx in idcs] - for split, idcs in ( - ("train", [0, 1, 2]), - ("train_noval", [0, 2]), - ("val", [3]), - ) - } + def _make_split_files(cls, root_map, *, split): + splits_and_idcs = [ + ("train", [0, 1, 2]), + ("val", [3]), + ] + if split == "train_noval": + splits_and_idcs.append(("train_noval", [0, 2])) + + ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs} for split, ids in ids_map.items(): with open(root_map[split] / f"{split}.txt", "w") as fh: @@ -710,12 +722,14 @@ def _make_segmentation(cls, size): return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() @classmethod - def generate(cls, root): + def generate(cls, root, *, split): archive_folder = root / "benchmark_RELEASE" dataset_folder = archive_folder / "dataset" dataset_folder.mkdir(parents=True, exist_ok=True) - ids, num_samples_map = cls._make_split_files(defaultdict(lambda: dataset_folder, {"train_noval": root})) + ids, num_samples_map = cls._make_split_files( + defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split + ) sizes = cls._make_anns_folder(dataset_folder, "cls", ids) create_image_folder( dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] @@ -723,12 +737,12 @@ def generate(cls, root): make_tar(root, "benchmark.tgz", 
archive_folder, compression="gz") - return num_samples_map + return num_samples_map[split] @register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) def sbd(root, config): - return SBDMockData.generate(root)[config["split"]] + return SBDMockData.generate(root, split=config["split"]) @register_mock(configs=[dict()]) diff --git a/test/common_extended_utils.py b/test/common_extended_utils.py new file mode 100644 index 00000000000..adb794f5db6 --- /dev/null +++ b/test/common_extended_utils.py @@ -0,0 +1,310 @@ +import os +from collections import defaultdict +from numbers import Number +from typing import Any + +import torch +from torch.utils._python_dispatch import TorchDispatchMode + +from torch.utils._pytree import tree_map + +from torchvision.models._api import Weights + +aten = torch.ops.aten +quantized = torch.ops.quantized + + +def get_shape(i): + if isinstance(i, torch.Tensor): + return i.shape + elif hasattr(i, "weight"): + return i.weight().shape + else: + raise ValueError(f"Unknown type {type(i)}") + + +def prod(x): + res = 1 + for i in x: + res *= i + return res + + +def matmul_flop(inputs: list[Any], outputs: list[Any]) -> Number: + """ + Count flops for matmul. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two matrices. + input_shapes = [get_shape(v) for v in inputs] + assert len(input_shapes) == 2, input_shapes + assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes + flop = prod(input_shapes[0]) * input_shapes[-1][-1] + return flop + + +def addmm_flop(inputs: list[Any], outputs: list[Any]) -> Number: + """ + Count flops for fully connected layers. + """ + # Count flop for nn.Linear + # inputs is a list of length 3. + input_shapes = [get_shape(v) for v in inputs[1:3]] + # input_shapes[0]: [batch size, input feature dimension] + # input_shapes[1]: [batch size, output feature dimension] + assert len(input_shapes[0]) == 2, input_shapes[0] + assert len(input_shapes[1]) == 2, input_shapes[1] + batch_size, input_dim = input_shapes[0] + output_dim = input_shapes[1][1] + flops = batch_size * input_dim * output_dim + return flops + + +def bmm_flop(inputs: list[Any], outputs: list[Any]) -> Number: + """ + Count flops for the bmm operation. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two tensor. + assert len(inputs) == 2, len(inputs) + input_shapes = [get_shape(v) for v in inputs] + n, c, t = input_shapes[0] + d = input_shapes[-1][-1] + flop = n * c * t * d + return flop + + +def conv_flop_count( + x_shape: list[int], + w_shape: list[int], + out_shape: list[int], + transposed: bool = False, +) -> Number: + """ + Count flops for convolution. Note only multiplication is + counted. Computation for addition and bias is ignored. + Flops for a transposed convolution are calculated as + flops = (x_shape[2:] * prod(w_shape) * batch_size). + Args: + x_shape (list(int)): The input shape before convolution. + w_shape (list(int)): The filter shape. + out_shape (list(int)): The output shape after convolution. + transposed (bool): is the convolution transposed + Returns: + int: the number of flops + """ + batch_size = x_shape[0] + conv_shape = (x_shape if transposed else out_shape)[2:] + flop = batch_size * prod(w_shape) * prod(conv_shape) + return flop + + +def conv_flop(inputs: list[Any], outputs: list[Any]): + """ + Count flops for convolution. 
+ """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + transposed = inputs[6] + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed) + + +def quant_conv_flop(inputs: list[Any], outputs: list[Any]): + """ + Count flops for quantized convolution. + """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=False) + + +def transpose_shape(shape): + return [shape[1], shape[0]] + list(shape[2:]) + + +def conv_backward_flop(inputs: list[Any], outputs: list[Any]): + grad_out_shape, x_shape, w_shape = (get_shape(i) for i in inputs[:3]) + output_mask = inputs[-1] + fwd_transposed = inputs[7] + flop_count = 0 + + if output_mask[0]: + grad_input_shape = get_shape(outputs[0]) + flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not fwd_transposed) + if output_mask[1]: + grad_weight_shape = get_shape(outputs[1]) + flop_count += conv_flop_count(transpose_shape(x_shape), grad_out_shape, grad_weight_shape, fwd_transposed) + + return flop_count + + +def scaled_dot_product_flash_attention_flop(inputs: list[Any], outputs: list[Any]): + # FIXME: this needs to count the flops of this kernel + # https://github.com/pytorch/pytorch/blob/207b06d099def9d9476176a1842e88636c1f714f/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp#L52-L267 + return 0 + + +flop_mapping = { + aten.mm: matmul_flop, + aten.matmul: matmul_flop, + aten.addmm: addmm_flop, + aten.bmm: bmm_flop, + aten.convolution: conv_flop, + aten._convolution: conv_flop, + aten.convolution_backward: conv_backward_flop, + quantized.conv2d: quant_conv_flop, + quantized.conv2d_relu: quant_conv_flop, + aten._scaled_dot_product_flash_attention: scaled_dot_product_flash_attention_flop, +} + +unmapped_ops = set() + + +def normalize_tuple(x): + if not isinstance(x, tuple): + return (x,) + return x + + +class FlopCounterMode(TorchDispatchMode): + def __init__(self, model=None): + self.flop_counts = defaultdict(lambda: defaultdict(int)) + self.parents = ["Global"] + # global mod + if model is not None: + for name, module in dict(model.named_children()).items(): + module.register_forward_pre_hook(self.enter_module(name)) + module.register_forward_hook(self.exit_module(name)) + + def enter_module(self, name): + def f(module, inputs): + self.parents.append(name) + inputs = normalize_tuple(inputs) + out = self.create_backwards_pop(name)(*inputs) + return out + + return f + + def exit_module(self, name): + def f(module, inputs, outputs): + assert self.parents[-1] == name + self.parents.pop() + outputs = normalize_tuple(outputs) + return self.create_backwards_push(name)(*outputs) + + return f + + def create_backwards_push(self, name): + class PushState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + self.parents.append(name) + return grad_outs + + return PushState.apply + + def create_backwards_pop(self, name): + class PopState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + assert self.parents[-1] == name + self.parents.pop() + return grad_outs + 
+ return PopState.apply + + def __enter__(self): + self.flop_counts.clear() + super().__enter__() + + def __exit__(self, *args): + # print(f"Total: {sum(self.flop_counts['Global'].values()) / 1e9} GFLOPS") + # for mod in self.flop_counts.keys(): + # print(f"Module: ", mod) + # for k, v in self.flop_counts[mod].items(): + # print(f"{k}: {v / 1e9} GFLOPS") + # print() + super().__exit__(*args) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs if kwargs else {} + + out = func(*args, **kwargs) + func_packet = func._overloadpacket + if func_packet in flop_mapping: + flop_count = flop_mapping[func_packet](args, normalize_tuple(out)) + for par in self.parents: + self.flop_counts[par][func_packet] += flop_count + else: + unmapped_ops.add(func_packet) + + return out + + def get_flops(self): + return sum(self.flop_counts["Global"].values()) / 1e9 + + +def get_dims(module_name, height, width): + # detection models have curated input sizes + if module_name == "detection": + # we can feed a batch of 1 for detection model instead of a list of 1 image + dims = (3, height, width) + elif module_name == "video": + # hard-coding the time dimension to size 16 + dims = (1, 16, 3, height, width) + else: + dims = (1, 3, height, width) + + return dims + + +def get_ops(model: torch.nn.Module, weight: Weights, height=512, width=512): + module_name = model.__module__.split(".")[-2] + dims = get_dims(module_name=module_name, height=height, width=width) + + input_tensor = torch.randn(dims) + + # try: + preprocess = weight.transforms() + if module_name == "optical_flow": + inp = preprocess(input_tensor, input_tensor) + else: + # hack to enable mod(*inp) for optical_flow models + inp = [preprocess(input_tensor)] + + model.eval() + + flop_counter = FlopCounterMode(model) + with flop_counter: + # detection models expect a list of 3d tensors as inputs + if module_name == "detection": + model(inp) + else: + model(*inp) + + flops = flop_counter.get_flops() + + return round(flops, 3) + + +def get_file_size_mb(weight): + weights_path = os.path.join(os.getenv("HOME"), ".cache/torch/hub/checkpoints", weight.url.split("/")[-1]) + weights_size_mb = os.path.getsize(weights_path) / 1024 / 1024 + + return round(weights_size_mb, 3) diff --git a/test/common_utils.py b/test/common_utils.py index 7017212fc67..74ad31fea72 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -1,35 +1,45 @@ import contextlib import functools +import itertools import os +import pathlib import random +import re import shutil +import sys import tempfile +import warnings +from subprocess import CalledProcessError, check_output, STDOUT import numpy as np +import PIL +import pytest import torch -from PIL import Image -from torchvision import io +import torch.testing -import __main__ # noqa: 401 +from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair +from torchvision import io, tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_image, to_pil_image +from torchvision.utils import _Image_fromarray -IN_CIRCLE_CI = os.getenv("CIRCLECI", False) == "true" +IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1" CUDA_NOT_AVAILABLE_MSG = "CUDA device not available" -CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and 
this test doesn't need cuda." +MPS_NOT_AVAILABLE_MSG = "MPS device not available" +OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda." @contextlib.contextmanager def get_tmp_dir(src=None, **kwargs): - tmp_dir = tempfile.mkdtemp(**kwargs) - if src is not None: - os.rmdir(tmp_dir) - shutil.copytree(src, tmp_dir) - try: + with tempfile.TemporaryDirectory( + **kwargs, + ) as tmp_dir: + if src is not None: + shutil.copytree(src, tmp_dir) yield tmp_dir - finally: - shutil.rmtree(tmp_dir) def set_rng_seed(seed): @@ -107,18 +117,28 @@ def disable_console_output(): yield -def cpu_and_gpu(): +def cpu_and_cuda(): import pytest # noqa return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda)) +def cpu_and_cuda_and_mps(): + return cpu_and_cuda() + (pytest.param("mps", marks=pytest.mark.needs_mps),) + + def needs_cuda(test_func): import pytest # noqa return pytest.mark.needs_cuda(test_func) +def needs_mps(test_func): + import pytest # noqa + + return pytest.mark.needs_mps(test_func) + + def _create_data(height=3, width=3, channels=3, device="cpu"): # TODO: When all relevant tests are ported to pytest, turn this into a module-level fixture tensor = torch.randint(0, 256, (channels, height, width), dtype=torch.uint8, device=device) @@ -127,7 +147,7 @@ def _create_data(height=3, width=3, channels=3, device="cpu"): if channels == 1: mode = "L" data = data[..., 0] - pil_img = Image.fromarray(data, mode=mode) + pil_img = _Image_fromarray(data, mode=mode) return tensor, pil_img @@ -137,9 +157,6 @@ def _create_data_batch(height=3, width=3, channels=3, num_samples=4, device="cpu return batch_tensor -assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0) - - def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): names = [] for i in range(num_videos): @@ -160,6 +177,7 @@ def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): + # FIXME: this is handled automatically by `assert_equal` below. Let's remove this in favor of it np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: np_pil_image = np_pil_image[:, :, None] @@ -172,6 +190,7 @@ def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): def _assert_approx_equal_tensor_to_pil( tensor, pil_image, tol=1e-5, msg=None, agg_method="mean", allowed_percentage_diff=None ): + # FIXME: this is handled automatically by `assert_close` below. Let's remove this in favor of it # TODO: we could just merge this into _assert_equal_tensor_to_pil np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: @@ -202,3 +221,321 @@ def _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=1e-8, **fn_kwargs): # scriptable function test s_transformed_batch = scripted_fn(batch_tensors, **fn_kwargs) torch.testing.assert_close(transformed_batch, s_transformed_batch, rtol=1e-5, atol=scripted_fn_atol) + + +def cache(fn): + """Similar to :func:`functools.cache` (Python >= 3.8) or :func:`functools.lru_cache` with infinite cache size, + but this also caches exceptions. + """ + sentinel = object() + out_cache = {} + exc_tb_cache = {} + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + key = args + tuple(kwargs.values()) + + out = out_cache.get(key, sentinel) + if out is not sentinel: + return out + + exc_tb = exc_tb_cache.get(key, sentinel) + if exc_tb is not sentinel: + raise exc_tb[0].with_traceback(exc_tb[1]) + + try: + out = fn(*args, **kwargs) + except Exception as exc: + # We need to cache the traceback here as well. 
Otherwise, each re-raise will add the internal pytest + # traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding + # the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details. + exc_tb_cache[key] = exc, exc.__traceback__ + raise exc + + out_cache[key] = out + return out + + return wrapper + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. + + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +class ImagePair(TensorLikePair): + def __init__( + self, + actual, + expected, + *, + mae=False, + **other_parameters, + ): + if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]): + actual, expected = (to_image(input) for input in [actual, expected]) + + super().__init__(actual, expected, **other_parameters) + self.mae = mae + + def compare(self) -> None: + actual, expected = self.actual, self.expected + + self._compare_attributes(actual, expected) + actual, expected = self._equalize_attributes(actual, expected) + + if self.mae: + if actual.dtype is torch.uint8: + actual, expected = actual.to(torch.int), expected.to(torch.int) + mae = float(torch.abs(actual - expected).float().mean()) + if mae > self.atol: + self._fail( + AssertionError, + f"The MAE of the images is {mae}, but only {self.atol} is allowed.", + ) + else: + super()._compare_values(actual, expected) + + +def assert_close( + actual, + expected, + *, + allow_subclasses=True, + rtol=None, + atol=None, + equal_nan=False, + check_device=True, + check_dtype=True, + check_layout=True, + check_stride=False, + msg=None, + **kwargs, +): + """Superset of :func:`torch.testing.assert_close` with support for PIL vs. 
tensor image comparison""" + __tracebackhide__ = True + + error_metas = not_close_error_metas( + actual, + expected, + pair_types=( + NonePair, + BooleanPair, + NumberPair, + ImagePair, + TensorLikePair, + ), + allow_subclasses=allow_subclasses, + rtol=rtol, + atol=atol, + equal_nan=equal_nan, + check_device=check_device, + check_dtype=check_dtype, + check_layout=check_layout, + check_stride=check_stride, + **kwargs, + ) + + if error_metas: + raise error_metas[0].to_error(msg) + + +assert_equal = functools.partial(assert_close, rtol=0, atol=0) + + +DEFAULT_SIZE = (17, 11) + + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return tv_tensors.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_keypoints(canvas_size=DEFAULT_SIZE, *, num_points=4, dtype=None, device="cpu"): + y = torch.randint(0, canvas_size[0], size=(num_points, 1), dtype=dtype, device=device) + x = torch.randint(0, canvas_size[1], size=(num_points, 1), dtype=dtype, device=device) + return tv_tensors.KeyPoints(torch.cat((x, y), dim=-1), canvas_size=canvas_size) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=tv_tensors.BoundingBoxFormat.XYXY, + clamping_mode="soft", + num_boxes=1, + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. 
+ return torch.stack([torch.randint(max_value - v, ()) for v in values.tolist()]) + + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + h, w = (torch.randint(1, s, (num_boxes,)) for s in canvas_size) + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + r = -360 * torch.rand((num_boxes,)) + 180 + + if format is tv_tensors.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is tv_tensors.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYWHR: + parts = (x, y, w, h, r) + elif format is tv_tensors.BoundingBoxFormat.CXCYWHR: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h, r) + elif format is tv_tensors.BoundingBoxFormat.XYXYXYXY: + r_rad = r * torch.pi / 180.0 + cos, sin = torch.cos(r_rad), torch.sin(r_rad) + x1 = x + y1 = y + x2 = x1 + w * cos + y2 = y1 - w * sin + x3 = x2 + h * sin + y3 = y2 + h * cos + x4 = x1 + h * sin + y4 = y1 + h * cos + parts = (x1, y1, x2, y2, x3, y3, x4, y4) + else: + raise ValueError(f"Format {format} is not supported") + out_boxes = torch.stack(parts, dim=-1).to(dtype=dtype, device=device) + return tv_tensors.BoundingBoxes(out_boxes, format=format, canvas_size=canvas_size, clamping_mode=clamping_mode) + + +def make_detection_masks(size=DEFAULT_SIZE, *, num_masks=1, dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (num_masks, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +def assert_run_python_script(source_code): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. Modified from scikit-learn test utils. + + Args: + source_code (str): The Python source code to execute. + """ + with get_tmp_dir() as root: + path = pathlib.Path(root) / "main.py" + with open(path, "w") as file: + file.write(source_code) + + try: + out = check_output([sys.executable, str(path)], stderr=STDOUT) + except CalledProcessError as e: + raise RuntimeError(f"script errored with output:\n{e.output.decode()}") + if out != b"": + raise AssertionError(out.decode()) + + +@contextlib.contextmanager +def assert_no_warnings(): + # The name `catch_warnings` is a misnomer as the context manager does **not** catch any warnings, but rather scopes + # the warning filters. All changes that are made to the filters while in this context, will be reset upon exit. 
+ with warnings.catch_warnings(): + warnings.simplefilter("error") + yield + + +@contextlib.contextmanager +def ignore_jit_no_profile_information_warning(): + # Calling a scripted object often triggers a warning like + # `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` + # with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore + # them. + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning) + yield diff --git a/test/conftest.py b/test/conftest.py index 1a9b2db7f5c..a9768598ded 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,22 +3,32 @@ import numpy as np import pytest import torch -from common_utils import CIRCLECI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_CIRCLE_CI, IN_FBCODE, IN_RE_WORKER + +from common_utils import ( + CUDA_NOT_AVAILABLE_MSG, + IN_FBCODE, + IN_OSS_CI, + IN_RE_WORKER, + MPS_NOT_AVAILABLE_MSG, + OSS_CI_GPU_NO_CUDA_MSG, +) def pytest_configure(config): # register an additional marker (see pytest_collection_modifyitems) config.addinivalue_line("markers", "needs_cuda: mark for tests that rely on a CUDA device") + config.addinivalue_line("markers", "needs_mps: mark for tests that rely on a MPS device") config.addinivalue_line("markers", "dont_collect: mark for tests that should not be collected") + config.addinivalue_line("markers", "opcheck_only_one: only opcheck one parametrization") def pytest_collection_modifyitems(items): # This hook is called by pytest after it has collected the tests (google its name to check out its doc!) # We can ignore some tests as we see fit here, or add marks, such as a skip mark. # - # Typically here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the + # Typically, here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the # tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already. - # This is true for both CircleCI and the fbcode internal CI. + # This is true for both OSS CI and the fbcode internal CI. # In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on # pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if # these tests never existed. @@ -28,16 +38,20 @@ def pytest_collection_modifyitems(items): # The needs_cuda mark will exist if the test was explicitly decorated with # the @needs_cuda decorator. It will also exist if it was parametrized with a # parameter that has the mark: for example if a test is parametrized with - # @pytest.mark.parametrize('device', cpu_and_gpu()) + # @pytest.mark.parametrize('device', cpu_and_cuda()) # the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark, # and the ones with device == 'cpu' won't have the mark. 
needs_cuda = item.get_closest_marker("needs_cuda") is not None + needs_mps = item.get_closest_marker("needs_mps") is not None if needs_cuda and not torch.cuda.is_available(): # In general, we skip cuda tests on machines without a GPU # There are special cases though, see below item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG)) + if needs_mps and not torch.backends.mps.is_available(): + item.add_marker(pytest.mark.skip(reason=MPS_NOT_AVAILABLE_MSG)) + if IN_FBCODE: # fbcode doesn't like skipping tests, so instead we just don't collect the test # so that they don't even "exist", hence the continue statements. @@ -49,15 +63,18 @@ def pytest_collection_modifyitems(items): # TODO: something more robust would be to do that only in a sandcastle instance, # so that we can still see the test being skipped when testing locally from a devvm continue - elif IN_CIRCLE_CI: + if needs_mps and not torch.backends.mps.is_available(): + # Same as above, but for MPS + continue + elif IN_OSS_CI: # Here we're not in fbcode, so we can safely collect and skip tests. if not needs_cuda and torch.cuda.is_available(): - # Similar to what happens in RE workers: we don't need the CircleCI GPU machines + # Similar to what happens in RE workers: we don't need the OSS CI GPU machines # to run the CPU-only tests. - item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)) + item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG)) if item.get_closest_marker("dont_collect") is not None: - # currently, this is only used for some tests we're sure we dont want to run on fbcode + # currently, this is only used for some tests we're sure we don't want to run on fbcode continue out_items.append(item) diff --git a/test/cpp/test_custom_operators.cpp b/test/cpp/test_custom_operators.cpp index e68f6c2f029..5178575d21b 100644 --- a/test/cpp/test_custom_operators.cpp +++ b/test/cpp/test_custom_operators.cpp @@ -7,7 +7,8 @@ TEST(test_custom_operators, nms) { // make sure that the torchvision ops are visible to the jit interpreter - auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::nms")); + auto& ops = torch::jit::getAllOperatorsFor( + torch::jit::Symbol::fromQualString("torchvision::nms")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); @@ -24,29 +25,35 @@ TEST(test_custom_operators, nms) { at::Tensor output = vision::ops::nms(boxes, scores, thresh); ASSERT_TRUE(output_jit.allclose(output)); - } TEST(test_custom_operators, roi_align_visible) { - // make sure that the torchvision ops are visible to the jit interpreter even if - // not explicitly included - auto& ops = torch::jit::getAllOperatorsFor(torch::jit::Symbol::fromQualString("torchvision::roi_align")); + // make sure that the torchvision ops are visible to the jit interpreter even + // if not explicitly included + auto& ops = torch::jit::getAllOperatorsFor( + torch::jit::Symbol::fromQualString("torchvision::roi_align")); ASSERT_EQ(ops.size(), 1); auto& op = ops.front(); ASSERT_EQ(op->schema().name(), "torchvision::roi_align"); torch::jit::Stack stack; - float roi_data[] = { - 0., 0., 0., 5., 5., - 0., 5., 5., 10., 10. 
- }; - at::Tensor input = at::rand({1, 2, 10, 10}), rois = at::from_blob(roi_data, {2, 5}); + float roi_data[] = {0., 0., 0., 5., 5., 0., 5., 5., 10., 10.}; + at::Tensor input = at::rand({1, 2, 10, 10}), + rois = at::from_blob(roi_data, {2, 5}); double spatial_scale = 1.0; int64_t pooled_height = 3, pooled_width = 3, sampling_ratio = -1; bool aligned = true; - torch::jit::push(stack, input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned); + torch::jit::push( + stack, + input, + rois, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio, + aligned); op->getOperation()(stack); at::Tensor output_jit; torch::jit::pop(stack, output_jit); diff --git a/test/datasets_utils.py b/test/datasets_utils.py index c232e7132b4..cbfb26b6c6b 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -5,6 +5,7 @@ import itertools import os import pathlib +import platform import random import shutil import string @@ -14,7 +15,8 @@ import unittest.mock import zipfile from collections import defaultdict -from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union +from collections.abc import Iterator, Sequence +from typing import Any, Callable, Optional, Union import numpy as np @@ -25,7 +27,12 @@ import torchvision.datasets import torchvision.io from common_utils import disable_console_output, get_tmp_dir +from torch.utils._pytree import tree_any +from torch.utils.data import DataLoader +from torchvision import tv_tensors +from torchvision.datasets import wrap_dataset_for_transforms_v2 from torchvision.transforms.functional import get_dimensions +from torchvision.transforms.v2.functional import get_size __all__ = [ @@ -57,6 +64,7 @@ class LazyImporter: provide modules listed in MODULES as attributes. They are only imported when accessed. """ + MODULES = ( "av", "lmdb", @@ -137,7 +145,7 @@ def test_foo(self, config): .. note:: - This will try to remove duplicate configurations. During this process it will not not preserve a potential + This will try to remove duplicate configurations. During this process it will not preserve a potential ordering of the configurations or an inner ordering of a configuration. """ @@ -146,7 +154,7 @@ def maybe_remove_duplicates(configs): return [dict(config_) for config_ in {tuple(sorted(config.items())) for config in configs}] except TypeError: # A TypeError will be raised if a value of any config is not hashable, e.g. a list. In that case duplicate - # removal would be a lot more elaborate and we simply bail out. + # removal would be a lot more elaborate, and we simply bail out. return configs @functools.wraps(test) @@ -169,23 +177,6 @@ def wrapper(self): return wrapper -def combinations_grid(**kwargs): - """Creates a grid of input combinations. - - Each element in the returned sequence is a dictionary containing one possible combination as values. - - Example: - >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) - [ - {'foo': 'bar', 'spam': 'eggs'}, - {'foo': 'bar', 'spam': 'ham'}, - {'foo': 'baz', 'spam': 'eggs'}, - {'foo': 'baz', 'spam': 'ham'} - ] - """ - return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] - - class DatasetTestCase(unittest.TestCase): """Abstract base class for all dataset testcases. 
@@ -291,13 +282,13 @@ def test_baz(self): "download_and_extract_archive", } - def dataset_args(self, tmpdir: str, config: Dict[str, Any]) -> Sequence[Any]: + def dataset_args(self, tmpdir: str, config: dict[str, Any]) -> Sequence[Any]: """Define positional arguments passed to the dataset. .. note:: The default behavior is only valid if the dataset to be tested has ``root`` as the only required parameter. - Otherwise you need to overwrite this method. + Otherwise, you need to overwrite this method. Args: tmpdir (str): Path to a temporary directory. For most cases this acts as root directory for the dataset @@ -310,7 +301,7 @@ def dataset_args(self, tmpdir: str, config: Dict[str, Any]) -> Sequence[Any]: """ return (tmpdir,) - def inject_fake_data(self, tmpdir: str, config: Dict[str, Any]) -> Union[int, Dict[str, Any]]: + def inject_fake_data(self, tmpdir: str, config: dict[str, Any]) -> Union[int, dict[str, Any]]: """Inject fake data for dataset into a temporary directory. During the creation of the dataset the download and extract logic is disabled. Thus, the fake data injected @@ -334,11 +325,11 @@ def inject_fake_data(self, tmpdir: str, config: Dict[str, Any]) -> Union[int, Di @contextlib.contextmanager def create_dataset( self, - config: Optional[Dict[str, Any]] = None, + config: Optional[dict[str, Any]] = None, inject_fake_data: bool = True, patch_checks: Optional[bool] = None, **kwargs: Any, - ) -> Iterator[Tuple[torchvision.datasets.VisionDataset, Dict[str, Any]]]: + ) -> Iterator[tuple[torchvision.datasets.VisionDataset, dict[str, Any]]]: r"""Create the dataset in a temporary directory. The configuration passed to the dataset is populated to contain at least all parameters with default values. @@ -564,7 +555,7 @@ def test_feature_types(self, config): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"] + assert len(list(dataset)) == len(dataset) == info["num_examples"] @test_all_configs def test_transforms(self, config): @@ -581,6 +572,39 @@ def test_transforms(self, config): mock.assert_called() + @test_all_configs + def test_transforms_v2_wrapper(self, config): + try: + with self.create_dataset(config) as (dataset, info): + for target_keys in [None, "all"]: + if target_keys is not None and self.DATASET_CLASS not in { + torchvision.datasets.CocoDetection, + torchvision.datasets.VOCDetection, + torchvision.datasets.Kitti, + torchvision.datasets.WIDERFace, + }: + with self.assertRaisesRegex(ValueError, "`target_keys` is currently only supported for"): + wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + continue + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + assert isinstance(wrapped_dataset, self.DATASET_CLASS) + assert len(wrapped_dataset) == info["num_examples"] + + wrapped_sample = wrapped_dataset[0] + assert tree_any( + lambda item: isinstance(item, (tv_tensors.TVTensor, PIL.Image.Image)), wrapped_sample + ) + except TypeError as error: + msg = f"No wrapper exists for dataset class {type(dataset).__name__}" + if str(error).startswith(msg): + pytest.skip(msg) + raise error + except RuntimeError as error: + if "currently not supported by this wrapper" in str(error): + pytest.skip("Config is currently not supported by this wrapper") + raise error + class ImageDatasetTestCase(DatasetTestCase): """Abstract base class for image dataset testcases. 
@@ -589,43 +613,56 @@ class ImageDatasetTestCase(DatasetTestCase): """ FEATURE_TYPES = (PIL.Image.Image, int) + SUPPORT_TV_IMAGE_DECODE: bool = False @contextlib.contextmanager def create_dataset( self, - config: Optional[Dict[str, Any]] = None, + config: Optional[dict[str, Any]] = None, inject_fake_data: bool = True, patch_checks: Optional[bool] = None, **kwargs: Any, - ) -> Iterator[Tuple[torchvision.datasets.VisionDataset, Dict[str, Any]]]: + ) -> Iterator[tuple[torchvision.datasets.VisionDataset, dict[str, Any]]]: with super().create_dataset( config=config, inject_fake_data=inject_fake_data, patch_checks=patch_checks, **kwargs, ) as (dataset, info): - # PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access + # PIL.Image.open() only loads the image metadata upfront and keeps the file open until the first access # to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we # force-load opened images. # This problem only occurs during testing since some tests, e.g. DatasetTestCase.test_feature_types open an # image, but never use the underlying data. During normal operation it is reasonable to assume that the # user wants to work with the image he just opened rather than deleting the underlying file. - with self._force_load_images(): + with self._force_load_images(loader=(config or {}).get("loader", None)): yield dataset, info @contextlib.contextmanager - def _force_load_images(self): - open = PIL.Image.open + def _force_load_images(self, loader: Optional[Callable[[str], Any]] = None): + open = loader or PIL.Image.open def new(fp, *args, **kwargs): image = open(fp, *args, **kwargs) - if isinstance(fp, (str, pathlib.Path)): + if isinstance(fp, (str, pathlib.Path)) and isinstance(image, PIL.Image.Image): image.load() return image - with unittest.mock.patch("PIL.Image.open", new=new): + with unittest.mock.patch(open.__module__ + "." + open.__qualname__, new=new): yield + def test_tv_decode_image_support(self): + if not self.SUPPORT_TV_IMAGE_DECODE: + pytest.skip(f"{self.DATASET_CLASS.__name__} does not support torchvision.io.decode_image.") + + with self.create_dataset( + config=dict( + loader=torchvision.io.decode_image, + ) + ) as (dataset, _): + image = dataset[0][0] + assert isinstance(image, torch.Tensor) + class VideoDatasetTestCase(DatasetTestCase): """Abstract base class for video dataset testcases. 
@@ -641,27 +678,76 @@ class VideoDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) REQUIRED_PACKAGES = ("av",) - DEFAULT_FRAMES_PER_CLIP = 1 + FRAMES_PER_CLIP = 1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.dataset_args = self._set_default_frames_per_clip(self.dataset_args) - def _set_default_frames_per_clip(self, inject_fake_data): + def _set_default_frames_per_clip(self, dataset_args): argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__) args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @functools.wraps(inject_fake_data) + @functools.wraps(dataset_args) def wrapper(tmpdir, config): - args = inject_fake_data(tmpdir, config) + args = dataset_args(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: - args = (*args, self.DEFAULT_FRAMES_PER_CLIP) + args = (*args, self.FRAMES_PER_CLIP) return args return wrapper + def test_output_format(self): + for output_format in ["TCHW", "THWC"]: + with self.create_dataset(output_format=output_format) as (dataset, _): + for video, *_ in dataset: + if output_format == "TCHW": + num_frames, num_channels, *_ = video.shape + else: # output_format == "THWC": + num_frames, *_, num_channels = video.shape + + assert num_frames == self.FRAMES_PER_CLIP + assert num_channels == 3 + + @test_all_configs + def test_transforms_v2_wrapper(self, config): + # `output_format == "THWC"` is not supported by the wrapper. Thus, we skip the `config` if it is set explicitly + # or use the supported `"TCHW"` + if config.setdefault("output_format", "TCHW") == "THWC": + return + + super().test_transforms_v2_wrapper.__wrapped__(self, config) + + +def _no_collate(batch): + return batch + + +def check_transforms_v2_wrapper_spawn(dataset, expected_size): + # This check ensures that the wrapped datasets can be used with multiprocessing_context="spawn" in the DataLoader. + # We also check that transforms are applied correctly as a non-regression test for + # https://github.com/pytorch/vision/issues/8066 + # Implicitly, this also checks that the wrapped datasets are pickleable. + + # To save CI/test time, we only check on Windows where "spawn" is the default + if platform.system() != "Windows": + pytest.skip("Multiprocessing spawning is only checked on Windows.") + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset) + + dataloader = DataLoader(wrapped_dataset, num_workers=2, multiprocessing_context="spawn", collate_fn=_no_collate) + + def resize_was_applied(item): + # Checking the size of the output ensures that the Resize transform was correctly applied + return isinstance(item, (tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)) and get_size(item) == list( + expected_size + ) + + for wrapped_sample in dataloader: + assert tree_any(resize_was_applied, wrapped_sample) + def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor: r"""Create a random uint8 tensor. @@ -715,7 +801,7 @@ def create_image_folder( num_examples: int, size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, **kwargs: Any, -) -> List[pathlib.Path]: +) -> list[pathlib.Path]: """Create a folder of random images. 
Args: @@ -737,7 +823,7 @@ def create_image_folder( """ if size is None: - def size(idx: int) -> Tuple[int, int, int]: + def size(idx: int) -> tuple[int, int, int]: num_channels = 3 height, width = torch.randint(3, 11, size=(2,), dtype=torch.int).tolist() return (num_channels, height, width) @@ -786,7 +872,7 @@ def create_video_file( fps: float = 25, **kwargs: Any, ) -> pathlib.Path: - """Create an video file from random data. + """Create a video file from random data. Args: root (Union[str, pathlib.Path]): Root directory the video file will be placed in. @@ -829,7 +915,7 @@ def create_video_folder( size: Optional[Union[Sequence[int], int, Callable[[int], Union[Sequence[int], int]]]] = None, fps=25, **kwargs, -) -> List[pathlib.Path]: +) -> list[pathlib.Path]: """Create a folder of random videos. Args: @@ -951,7 +1037,7 @@ def create_random_string(length: int, *digits: str) -> str: Args: length (int): Number of characters in the generated string. - *characters (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. + *digits (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. """ if not digits: digits = string.ascii_lowercase diff --git a/test/expect/ModelTester.test_crestereo_base_expect.pkl b/test/expect/ModelTester.test_crestereo_base_expect.pkl new file mode 100644 index 00000000000..e5b8cd8f666 Binary files /dev/null and b/test/expect/ModelTester.test_crestereo_base_expect.pkl differ diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl index e95ba5f5398..862af2185c7 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl index c2875679efd..1d317eb7915 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl index 0657261d96c..3d4e3e63f28 100644 Binary files a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl index 2f1ff941aba..54dfb7cd206 100644 Binary files a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl index 36b68081672..f52b77a8dd8 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl index c6d1fd14081..23e841bf874 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl differ diff --git 
a/test/expect/ModelTester.test_maxvit_t_expect.pkl b/test/expect/ModelTester.test_maxvit_t_expect.pkl new file mode 100644 index 00000000000..3a93545f614 Binary files /dev/null and b/test/expect/ModelTester.test_maxvit_t_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl index 7fb8d66b080..f188ee7b911 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl index 9c74f2e9b99..beaf6c8e84b 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_b_expect.pkl b/test/expect/ModelTester.test_swin3d_b_expect.pkl new file mode 100644 index 00000000000..1efc513c911 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_b_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_s_expect.pkl b/test/expect/ModelTester.test_swin3d_s_expect.pkl new file mode 100644 index 00000000000..0c1e594993e Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_s_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_t_expect.pkl b/test/expect/ModelTester.test_swin3d_t_expect.pkl new file mode 100644 index 00000000000..5e658ff16b7 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_t_expect.pkl differ diff --git a/test/optests_failures_dict.json b/test/optests_failures_dict.json new file mode 100644 index 00000000000..3bad0bbb027 --- /dev/null +++ b/test/optests_failures_dict.json @@ -0,0 +1,5 @@ +{ + "_description": "This is a dict containing failures for tests autogenerated by generate_opcheck_tests. 
For more details, please see https://docs.google.com/document/d/1Pj5HRZvdOq3xpFpbEjUZp2hBovhy7Wnxw14m6lF2154/edit", + "_version": 1, + "data": {} +} diff --git a/test/smoke_test.py b/test/smoke_test.py index c3a4bdd19d6..e2a3b5068ab 100644 --- a/test/smoke_test.py +++ b/test/smoke_test.py @@ -1,4 +1,147 @@ +"""Run smoke tests""" + +import os +import sys +import sysconfig +from pathlib import Path + import torch import torchvision -import torchvision.datasets as dset -import torchvision.transforms +from torchvision.io import decode_avif, decode_heic, decode_image, decode_jpeg, read_file +from torchvision.models import resnet50, ResNet50_Weights + + +SCRIPT_DIR = Path(__file__).parent + + +def smoke_test_torchvision() -> None: + print( + "Is torchvision usable?", + all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]), + ) + + +def smoke_test_torchvision_read_decode() -> None: + img_jpg = decode_image(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + + img_png = decode_image(str(SCRIPT_DIR / "assets" / "interlaced_png" / "wizard_low.png")) + if img_png.shape != (4, 471, 354): + raise RuntimeError(f"Unexpected shape of img_png: {img_png.shape}") + + img_webp = decode_image(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch.webp")) + if img_webp.shape != (3, 100, 100): + raise RuntimeError(f"Unexpected shape of img_webp: {img_webp.shape}") + + if sys.platform == "linux": + pass + # TODO: Fix/uncomment below (the TODO below is mostly accurate but we're + # still observing some failures on some CUDA jobs. Most are working.) + # if torch.cuda.is_available(): + # # TODO: For whatever reason this only passes on the runners that + # # support CUDA. + # # Strangely, on the CPU runners where this fails, the AVIF/HEIC + # # tests (ran with pytest) are passing. This is likely related to a + # # libcxx symbol thing, and the proper libstdc++.so get loaded only + # # with pytest? Ugh. 
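The shape checks above all follow the same pattern: decode a known asset and compare against a hard-coded (C, H, W) shape. A hedged helper that factors out that pattern (the function name and the commented call are illustrative; the asset path and expected shape are the ones used by the smoke test above):

    from torchvision.io import decode_image

    def check_decoded_shape(path, expected_shape):
        # Decode an image from disk and fail loudly if its (C, H, W) shape is not
        # what we expect, mirroring smoke_test_torchvision_read_decode above.
        img = decode_image(str(path))
        if tuple(img.shape) != tuple(expected_shape):
            raise RuntimeError(f"Unexpected shape of {path}: {img.shape}, expected {expected_shape}")
        return img

    # e.g. the Grace Hopper asset decodes to 3 x 606 x 517 (C, H, W):
    # check_decoded_shape("test/assets/encode_jpeg/grace_hopper_517x606.jpg", (3, 606, 517))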
+ # img_avif = decode_avif(read_file(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch.avif"))) + # if img_avif.shape != (3, 100, 100): + # raise RuntimeError(f"Unexpected shape of img_avif: {img_avif.shape}") + + # img_heic = decode_heic( + # read_file(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic")) + # ) + # if img_heic.shape != (3, 100, 100): + # raise RuntimeError(f"Unexpected shape of img_heic: {img_heic.shape}") + else: + try: + decode_avif(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch.avif")) + except RuntimeError as e: + assert "torchvision-extra-decoders" in str(e) + + try: + decode_heic(str(SCRIPT_DIR / "assets/fakedata/logos/rgb_pytorch_incorrectly_encoded_but_who_cares.heic")) + except RuntimeError as e: + assert "torchvision-extra-decoders" in str(e) + + +def smoke_test_torchvision_decode_jpeg(device: str = "cpu"): + img_jpg_data = read_file(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + img_jpg = decode_jpeg(img_jpg_data, device=device) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + + +def smoke_test_compile() -> None: + try: + model = resnet50().cuda() + model = torch.compile(model) + x = torch.randn(1, 3, 224, 224, device="cuda") + out = model(x) + print(f"torch.compile model output: {out.shape}") + except RuntimeError: + if sys.platform == "win32": + print("Successfully caught torch.compile RuntimeError on win") + else: + raise + + +def smoke_test_torchvision_resnet50_classify(device: str = "cpu") -> None: + img = decode_image(str(SCRIPT_DIR / ".." / "gallery" / "assets" / "dog2.jpg")).to(device) + + # Step 1: Initialize model with the best available weights + weights = ResNet50_Weights.DEFAULT + model = resnet50(weights=weights, progress=False).to(device) + model.eval() + + # Step 2: Initialize the inference transforms + preprocess = weights.transforms(antialias=True) + + # Step 3: Apply inference preprocessing transforms + batch = preprocess(img).unsqueeze(0) + + # Step 4: Use the model and print the predicted category + prediction = model(batch).squeeze(0).softmax(0) + class_id = prediction.argmax().item() + score = prediction[class_id].item() + category_name = weights.meta["categories"][class_id] + expected_category = "German shepherd" + print(f"{category_name} ({device}): {100 * score:.1f}%") + if category_name != expected_category: + raise RuntimeError(f"Failed ResNet50 classify {category_name} Expected: {expected_category}") + + +def main() -> None: + print(f"torchvision: {torchvision.__version__}") + print(f"torch.cuda.is_available: {torch.cuda.is_available()}") + + print(f"{torch.ops.image._jpeg_version() = }") + if not torch.ops.image._is_compiled_against_turbo(): + msg = "Torchvision wasn't compiled against libjpeg-turbo" + if os.getenv("IS_M1_CONDA_BUILD_JOB") == "1": + # When building the conda package on M1, it's difficult to enforce + # that we build against turbo due to interactions with the libwebp + # package. So we just accept it, instead of raising an error. 
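The classification smoke test above exercises the weights/transforms API end to end: pick `ResNet50_Weights.DEFAULT`, build the matching preprocessing pipeline with `weights.transforms()`, and read the category names from `weights.meta["categories"]`. A small variation on that flow, reporting the top-5 categories instead of only the argmax (this sketch downloads the pretrained weights on first use; the function name and the image path are caller-supplied, not part of the smoke test):

    import torch
    from torchvision.io import decode_image
    from torchvision.models import resnet50, ResNet50_Weights

    def classify_top5(image_path, device="cpu"):
        weights = ResNet50_Weights.DEFAULT
        model = resnet50(weights=weights, progress=False).to(device).eval()
        preprocess = weights.transforms(antialias=True)

        img = decode_image(str(image_path)).to(device)
        batch = preprocess(img).unsqueeze(0)
        with torch.no_grad():
            probs = model(batch).squeeze(0).softmax(0)

        top = torch.topk(probs, 5)
        # Map class indices back to human-readable category names.
        return [(weights.meta["categories"][int(i)], float(p)) for p, i in zip(top.values, top.indices)]

    # classify_top5("gallery/assets/dog2.jpg") should rank "German shepherd" first.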
+ print(msg) + else: + raise ValueError(msg) + + smoke_test_torchvision() + smoke_test_torchvision_read_decode() + smoke_test_torchvision_resnet50_classify() + smoke_test_torchvision_decode_jpeg() + if torch.cuda.is_available(): + smoke_test_torchvision_decode_jpeg("cuda") + smoke_test_torchvision_resnet50_classify("cuda") + + # torch.compile is not supported on Python 3.14+ and Python built with GIL disabled + if sys.version_info < (3, 14, 0) and not sysconfig.get_config_var("Py_GIL_DISABLED"): + smoke_test_compile() + + if torch.backends.mps.is_available(): + smoke_test_torchvision_resnet50_classify("mps") + + +if __name__ == "__main__": + main() diff --git a/test/test_architecture_ops.py b/test/test_architecture_ops.py new file mode 100644 index 00000000000..32ad1a32f89 --- /dev/null +++ b/test/test_architecture_ops.py @@ -0,0 +1,46 @@ +import unittest + +import pytest +import torch + +from torchvision.models.maxvit import SwapAxes, WindowDepartition, WindowPartition + + +class MaxvitTester(unittest.TestCase): + def test_maxvit_window_partition(self): + input_shape = (1, 3, 224, 224) + partition_size = 7 + n_partitions = input_shape[3] // partition_size + + x = torch.randn(input_shape) + + partition = WindowPartition() + departition = WindowDepartition() + + x_hat = partition(x, partition_size) + x_hat = departition(x_hat, partition_size, n_partitions, n_partitions) + + torch.testing.assert_close(x, x_hat) + + def test_maxvit_grid_partition(self): + input_shape = (1, 3, 224, 224) + partition_size = 7 + n_partitions = input_shape[3] // partition_size + + x = torch.randn(input_shape) + pre_swap = SwapAxes(-2, -3) + post_swap = SwapAxes(-2, -3) + + partition = WindowPartition() + departition = WindowDepartition() + + x_hat = partition(x, n_partitions) + x_hat = pre_swap(x_hat) + x_hat = post_swap(x_hat) + x_hat = departition(x_hat, n_partitions, partition_size, partition_size) + + torch.testing.assert_close(x, x_hat) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 4fba3c3d098..8f6fc5d9278 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -1,6 +1,7 @@ import random +from collections.abc import Mapping, Sequence +from copy import deepcopy from itertools import chain -from typing import Mapping, Sequence import pytest import torch @@ -194,7 +195,7 @@ def test_feature_extraction_methods_equivalence(self): assert n1 == n2 assert p1.equal(p2) - # And that ouputs match + # And that outputs match with torch.no_grad(): ilg_out = ilg_model(self.inp) fgn_out = fx_model(self.inp) @@ -322,3 +323,14 @@ def forward(self, x): out = model(self.inp) # And backward out["leaf_module"].float().mean().backward() + + def test_deepcopy(self): + # Non-regression test for https://github.com/pytorch/vision/issues/8634 + model = models.efficientnet_b3(weights=None) + extractor = create_feature_extractor(model=model, return_nodes={"classifier.0": "out"}) + + extractor.eval() + extractor.train() + extractor = deepcopy(extractor) + extractor.eval() + extractor.train() diff --git a/test/test_cpp_models.py b/test/test_cpp_models.py deleted file mode 100644 index d8d0836d499..00000000000 --- a/test/test_cpp_models.py +++ /dev/null @@ -1,154 +0,0 @@ -import os -import sys -import unittest - -import torch -import torchvision.transforms.functional as F -from PIL import Image -from torchvision import models - -try: - from torchvision import _C_tests -except ImportError: - _C_tests = None - - -def 
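The new `test_deepcopy` above is a non-regression test for pytorch/vision#8634: a feature extractor must survive `copy.deepcopy` and subsequent `eval()`/`train()` switches. A standalone sketch of that scenario (resnet18 and its "layer4" node are used here purely for illustration; the test itself uses efficientnet_b3 and "classifier.0"):

    from copy import deepcopy

    import torch
    from torchvision.models import resnet18
    from torchvision.models.feature_extraction import create_feature_extractor

    model = resnet18(weights=None)
    extractor = create_feature_extractor(model, return_nodes={"layer4": "feat"})

    extractor = deepcopy(extractor)  # must not raise
    extractor.eval()
    extractor.train()

    out = extractor(torch.randn(1, 3, 224, 224))
    print(out["feat"].shape)  # torch.Size([1, 512, 7, 7])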
process_model(model, tensor, func, name): - model.eval() - traced_script_module = torch.jit.trace(model, tensor) - traced_script_module.save("model.pt") - - py_output = model.forward(tensor) - cpp_output = func("model.pt", tensor) - - assert torch.allclose(py_output, cpp_output), "Output mismatch of " + name + " models" - - -def read_image1(): - image_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" - ) - image = Image.open(image_path) - image = image.resize((224, 224)) - x = F.pil_to_tensor(image) - x = F.convert_image_dtype(x) - return x.view(1, 3, 224, 224) - - -def read_image2(): - image_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" - ) - image = Image.open(image_path) - image = image.resize((299, 299)) - x = F.pil_to_tensor(image) - x = F.convert_image_dtype(x) - x = x.view(1, 3, 299, 299) - return torch.cat([x, x], 0) - - -@unittest.skipIf( - sys.platform == "darwin" or True, - "C++ models are broken on OS X at the moment, and there's a BC breakage on main; " - "see https://github.com/pytorch/vision/issues/1191", -) -class Tester(unittest.TestCase): - image = read_image1() - - def test_alexnet(self): - process_model(models.alexnet(), self.image, _C_tests.forward_alexnet, "Alexnet") - - def test_vgg11(self): - process_model(models.vgg11(), self.image, _C_tests.forward_vgg11, "VGG11") - - def test_vgg13(self): - process_model(models.vgg13(), self.image, _C_tests.forward_vgg13, "VGG13") - - def test_vgg16(self): - process_model(models.vgg16(), self.image, _C_tests.forward_vgg16, "VGG16") - - def test_vgg19(self): - process_model(models.vgg19(), self.image, _C_tests.forward_vgg19, "VGG19") - - def test_vgg11_bn(self): - process_model(models.vgg11_bn(), self.image, _C_tests.forward_vgg11bn, "VGG11BN") - - def test_vgg13_bn(self): - process_model(models.vgg13_bn(), self.image, _C_tests.forward_vgg13bn, "VGG13BN") - - def test_vgg16_bn(self): - process_model(models.vgg16_bn(), self.image, _C_tests.forward_vgg16bn, "VGG16BN") - - def test_vgg19_bn(self): - process_model(models.vgg19_bn(), self.image, _C_tests.forward_vgg19bn, "VGG19BN") - - def test_resnet18(self): - process_model(models.resnet18(), self.image, _C_tests.forward_resnet18, "Resnet18") - - def test_resnet34(self): - process_model(models.resnet34(), self.image, _C_tests.forward_resnet34, "Resnet34") - - def test_resnet50(self): - process_model(models.resnet50(), self.image, _C_tests.forward_resnet50, "Resnet50") - - def test_resnet101(self): - process_model(models.resnet101(), self.image, _C_tests.forward_resnet101, "Resnet101") - - def test_resnet152(self): - process_model(models.resnet152(), self.image, _C_tests.forward_resnet152, "Resnet152") - - def test_resnext50_32x4d(self): - process_model(models.resnext50_32x4d(), self.image, _C_tests.forward_resnext50_32x4d, "ResNext50_32x4d") - - def test_resnext101_32x8d(self): - process_model(models.resnext101_32x8d(), self.image, _C_tests.forward_resnext101_32x8d, "ResNext101_32x8d") - - def test_wide_resnet50_2(self): - process_model(models.wide_resnet50_2(), self.image, _C_tests.forward_wide_resnet50_2, "WideResNet50_2") - - def test_wide_resnet101_2(self): - process_model(models.wide_resnet101_2(), self.image, _C_tests.forward_wide_resnet101_2, "WideResNet101_2") - - def test_squeezenet1_0(self): - process_model(models.squeezenet1_0(), self.image, _C_tests.forward_squeezenet1_0, "Squeezenet1.0") - - def test_squeezenet1_1(self): - 
process_model(models.squeezenet1_1(), self.image, _C_tests.forward_squeezenet1_1, "Squeezenet1.1") - - def test_densenet121(self): - process_model(models.densenet121(), self.image, _C_tests.forward_densenet121, "Densenet121") - - def test_densenet169(self): - process_model(models.densenet169(), self.image, _C_tests.forward_densenet169, "Densenet169") - - def test_densenet201(self): - process_model(models.densenet201(), self.image, _C_tests.forward_densenet201, "Densenet201") - - def test_densenet161(self): - process_model(models.densenet161(), self.image, _C_tests.forward_densenet161, "Densenet161") - - def test_mobilenet_v2(self): - process_model(models.mobilenet_v2(), self.image, _C_tests.forward_mobilenetv2, "MobileNet") - - def test_googlenet(self): - process_model(models.googlenet(), self.image, _C_tests.forward_googlenet, "GoogLeNet") - - def test_mnasnet0_5(self): - process_model(models.mnasnet0_5(), self.image, _C_tests.forward_mnasnet0_5, "MNASNet0_5") - - def test_mnasnet0_75(self): - process_model(models.mnasnet0_75(), self.image, _C_tests.forward_mnasnet0_75, "MNASNet0_75") - - def test_mnasnet1_0(self): - process_model(models.mnasnet1_0(), self.image, _C_tests.forward_mnasnet1_0, "MNASNet1_0") - - def test_mnasnet1_3(self): - process_model(models.mnasnet1_3(), self.image, _C_tests.forward_mnasnet1_3, "MNASNet1_3") - - def test_inception_v3(self): - self.image = read_image2() - process_model(models.inception_v3(), self.image, _C_tests.forward_inceptionv3, "Inceptionv3") - - -if __name__ == "__main__": - unittest.main() diff --git a/test/test_datasets.py b/test/test_datasets.py index ad31856cd01..22c14cbc08d 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -8,12 +8,13 @@ import pathlib import pickle import random +import re import shutil import string import unittest import xml.etree.ElementTree as ET import zipfile -from typing import Callable, Tuple, Union +from typing import Callable, Union import datasets_utils import numpy as np @@ -21,12 +22,15 @@ import pytest import torch import torch.nn.functional as F +from common_utils import combinations_grid from torchvision import datasets +from torchvision.io import decode_image +from torchvision.transforms import v2 class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) @staticmethod def _make_binary_file(num_elements, root, name): @@ -112,9 +116,7 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech101 FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - target_type=("category", "annotation", ["category", "annotation"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(target_type=("category", "annotation", ["category", "annotation"])) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir, config): @@ -183,6 +185,11 @@ def test_combined_targets(self): ), "Type of the combined target does not match the type of the corresponding individual target: " f"{actual} is not {expected}", + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(target_type="category", transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class 
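Most of the changes below swap `datasets_utils.combinations_grid` for the `combinations_grid` imported from `common_utils` at the top of the file. For context, a behavioural sketch of what such a helper produces (this is the idea, not necessarily the exact implementation in `test/common_utils.py`): every keyword maps to an iterable of options, and the result is one config dict per element of the cartesian product.

    import itertools

    def combinations_grid(**kwargs):
        # One dict per combination of the supplied options.
        return [dict(zip(kwargs, values)) for values in itertools.product(*kwargs.values())]

    combinations_grid(split=("train", "test"), fold=(1, 2))
    # [{'split': 'train', 'fold': 1}, {'split': 'train', 'fold': 2},
    #  {'split': 'test', 'fold': 1}, {'split': 'test', 'fold': 2}]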
Caltech256TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech256 @@ -190,7 +197,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" - categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + categories = ((1, "ak47"), (2, "american-flag"), (3, "backpack")) num_images_per_category = 2 for idx, category in categories: @@ -207,7 +214,7 @@ def inject_fake_data(self, tmpdir, config): class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.WIDERFace FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): widerface_dir = pathlib.Path(tmpdir) / "widerface" @@ -258,6 +265,11 @@ def inject_fake_data(self, tmpdir, config): return split_to_num_examples[config["split"]] + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Cityscapes @@ -268,8 +280,8 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): "color", ) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), - *datasets_utils.combinations_grid( + *combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), + *combinations_grid( mode=("coarse",), split=("train", "train_extra", "val"), target_type=TARGET_TYPES, @@ -382,11 +394,19 @@ def test_feature_types_target_polygon(self): assert isinstance(polygon_img, PIL.Image.Image) (polygon_target, info["expected_polygon_target"]) + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + for target_type in ["instance", "semantic", ["instance", "semantic"]]: + with self.create_dataset(target_type=target_type, transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ImageNet REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) @@ -413,10 +433,15 @@ def inject_fake_data(self, tmpdir, config): torch.save((wnid_to_classes, None), tmpdir / "meta.bin") return num_examples + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CIFAR10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _VERSION_CONFIG = dict( base_folder="cifar-10-batches-py", @@ -489,7 +514,7 @@ class 
CelebATestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CelebA FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "valid", "test", "all"), target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), ) @@ -510,7 +535,8 @@ def inject_fake_data(self, tmpdir, config): self._create_bbox_txt(base_folder, num_images) self._create_landmarks_txt(base_folder, num_images) - return dict(num_examples=num_images_per_split[config["split"]], attr_names=attr_names) + num_samples = num_images_per_split.get(config["split"], 0) if isinstance(config["split"], str) else 0 + return dict(num_examples=num_samples, attr_names=attr_names) def _create_split_txt(self, root): num_images_per_split = dict(train=4, valid=3, test=2) @@ -607,25 +633,46 @@ def test_images_names_split(self): assert merged_imgs_names == all_imgs_names + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + for target_type in ["identity", "bbox", ["identity", "bbox"]]: + with self.create_dataset(target_type=target_type, transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + + def test_invalid_split_list(self): + with pytest.raises(ValueError, match="Expected type str for argument split, but got type ."): + with self.create_dataset(split=[1]): + pass + + def test_invalid_split_int(self): + with pytest.raises(ValueError, match="Expected type str for argument split, but got type ."): + with self.create_dataset(split=1): + pass + + def test_invalid_split_value(self): + with pytest.raises( + ValueError, + match="Unknown value '{value}' for argument {arg}. 
Valid values are {{{valid_values}}}.".format( + value="invalid", + arg="split", + valid_values=("train", "valid", "test", "all"), + ), + ): + with self.create_dataset(split="invalid"): + pass + class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.VOCSegmentation FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid( - year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") - ), + *combinations_grid(year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval")), dict(year="2007", image_set="test"), - dict(year="2007-test", image_set="test"), ) def inject_fake_data(self, tmpdir, config): - year, is_test_set = ( - ("2007", True) - if config["year"] == "2007-test" or config["image_set"] == "test" - else (config["year"], False) - ) + year, is_test_set = config["year"], config["image_set"] == "test" image_set = config["image_set"] base_dir = pathlib.Path(tmpdir) @@ -701,6 +748,11 @@ def add_bndbox(obj, bndbox=None): return data + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class VOCDetectionTestCase(VOCSegmentationTestCase): DATASET_CLASS = datasets.VOCDetection @@ -721,6 +773,11 @@ def test_annotations(self): assert object == info["annotation"] + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CocoDetection @@ -751,28 +808,52 @@ def inject_fake_data(self, tmpdir, config): annotation_folder = tmpdir / self._ANNOTATIONS_FOLDER os.makedirs(annotation_folder) + + segmentation_kind = config.pop("segmentation_kind", "list") info = self._create_annotation_file( - annotation_folder, self._ANNOTATIONS_FILE, file_names, num_annotations_per_image + annotation_folder, + self._ANNOTATIONS_FILE, + file_names, + num_annotations_per_image, + segmentation_kind=segmentation_kind, ) info["num_examples"] = num_images return info - def _create_annotation_file(self, root, name, file_names, num_annotations_per_image): + def _create_annotation_file(self, root, name, file_names, num_annotations_per_image, segmentation_kind="list"): image_ids = [int(file_name.stem) for file_name in file_names] images = [dict(file_name=str(file_name), id=id) for file_name, id in zip(file_names, image_ids)] - annotations, info = self._create_annotations(image_ids, num_annotations_per_image) + annotations, info = self._create_annotations(image_ids, num_annotations_per_image, segmentation_kind) self._create_json(root, name, dict(images=images, annotations=annotations)) return info - def _create_annotations(self, image_ids, num_annotations_per_image): - annotations = datasets_utils.combinations_grid( - image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image - ) - for id, annotation in enumerate(annotations): - annotation["id"] = id + def _create_annotations(self, image_ids, num_annotations_per_image, segmentation_kind="list"): + annotations = [] + annotion_id = 0 + + for image_id in itertools.islice(itertools.cycle(image_ids), len(image_ids) * num_annotations_per_image): + 
segmentation = { + "list": [torch.rand(8).tolist()], + "rle": {"size": [10, 10], "counts": [1]}, + "rle_encoded": {"size": [2400, 2400], "counts": "PQRQ2[1\\Y2f0gNVNRhMg2"}, + "bad": 123, + }[segmentation_kind] + + annotations.append( + dict( + image_id=image_id, + id=annotion_id, + bbox=torch.rand(4).tolist(), + segmentation=segmentation, + category_id=int(torch.randint(91, ())), + area=float(torch.rand(1)), + iscrowd=int(torch.randint(2, size=(1,))), + ) + ) + annotion_id += 1 return annotations, dict() def _create_json(self, root, name, content): @@ -781,13 +862,39 @@ def _create_json(self, root, name, content): json.dump(content, fh) return file + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + + def test_slice_error(self): + with self.create_dataset() as (dataset, _): + with pytest.raises(ValueError, match="Index must be of type integer"): + dataset[:2] + + def test_segmentation_kind(self): + if isinstance(self, CocoCaptionsTestCase): + return + + for segmentation_kind in ("list", "rle", "rle_encoded"): + config = {"segmentation_kind": segmentation_kind} + with self.create_dataset(config) as (dataset, _): + dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys="all") + list(dataset) + + config = {"segmentation_kind": "bad"} + with self.create_dataset(config) as (dataset, _): + dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys="all") + with pytest.raises(ValueError, match="COCO segmentation expected to be a dict or a list"): + list(dataset) + class CocoCaptionsTestCase(CocoDetectionTestCase): DATASET_CLASS = datasets.CocoCaptions - def _create_annotations(self, image_ids, num_annotations_per_image): + def _create_annotations(self, image_ids, num_annotations_per_image, segmentation_kind="list"): captions = [str(idx) for idx in range(num_annotations_per_image)] - annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) + annotations = combinations_grid(image_id=image_ids, caption=captions) for id, annotation in enumerate(annotations): annotation["id"] = id return annotations, dict(captions=captions) @@ -797,11 +904,16 @@ def test_captions(self): _, captions = dataset[0] assert tuple(captions) == tuple(info["captions"]) + def test_transforms_v2_wrapper_spawn(self): + # We need to define this method, because otherwise the test from the super class will + # be run + pytest.skip("CocoCaptions is currently not supported by the v2 wrapper.") + class UCF101TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.UCF101 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _ANNOTATIONS_FOLDER = "annotations" @@ -862,9 +974,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.LSUN REQUIRED_PACKAGES = ("lmdb",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"])) _CATEGORIES = ( "bedroom", @@ -949,7 +1059,7 @@ def test_not_found_or_corrupted(self): class KineticsTestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.Kinetics - 
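The `segmentation_kind` options above cover the shapes a COCO `segmentation` field can take: a list of polygons, an uncompressed RLE dict, and a compressed (string-encoded) RLE dict. Illustrative minimal annotation records matching what the fake-data generator writes (all values are arbitrary):

    import json

    polygon_ann = {
        "image_id": 0,
        "id": 0,
        "bbox": [1.0, 2.0, 3.0, 4.0],
        "segmentation": [[1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 1.0, 2.0]],  # one 4-point polygon
        "category_id": 1,
        "area": 1.0,
        "iscrowd": 0,
    }

    # Uncompressed RLE: a dict with "size" and a list of run lengths in "counts".
    rle_ann = dict(polygon_ann, id=1, iscrowd=1, segmentation={"size": [10, 10], "counts": [1]})

    print(json.dumps([polygon_ann, rle_ann], indent=2))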
ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) def inject_fake_data(self, tmpdir, config): classes = ("Abseiling", "Zumba") @@ -965,30 +1075,17 @@ def inject_fake_data(self, tmpdir, config): ) return num_videos_per_class * len(classes) - -class Kinetics400TestCase(datasets_utils.VideoDatasetTestCase): - DATASET_CLASS = datasets.Kinetics400 - - def inject_fake_data(self, tmpdir, config): - classes = ("Abseiling", "Zumba") - num_videos_per_class = 2 - - digits = string.ascii_letters + string.digits + "-_" - for cls in classes: - datasets_utils.create_video_folder( - tmpdir, - cls, - lambda _: f"{datasets_utils.create_random_string(11, digits)}.avi", - num_videos_per_class, - ) - - return num_videos_per_class * len(classes) + @pytest.mark.xfail(reason="FIXME") + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(output_format="TCHW", transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.HMDB51 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _SPLITS_FOLDER = "splits" @@ -1048,7 +1145,8 @@ def _create_split_files(self, root, video_files, fold, train): class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Omniglot - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(background=(True, False)) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): target_folder = ( @@ -1079,6 +1177,8 @@ class SBUTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SBU FEATURE_TYPES = (PIL.Image.Image, str) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): num_images = 3 @@ -1128,7 +1228,7 @@ def inject_fake_data(self, tmpdir, config): class USPSTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.USPS - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): num_images = 2 if config["train"] else 1 @@ -1150,7 +1250,7 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") ) @@ -1211,6 +1311,11 @@ def _create_segmentation(self, size): def _file_stem(self, idx): return f"2008_{idx:06d}" + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(mode="segmentation", transforms=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FakeData @@ -1236,7 +1341,7 @@ class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): _TRAIN_FEATURE_TYPES = (torch.Tensor,) _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - 
datasets_utils.combinations_grid(train=(True, False)) + combinations_grid(train=(True, False)) _NAME = "liberty" @@ -1312,6 +1417,8 @@ class Flickr8kTestCase(datasets_utils.ImageDatasetTestCase): _IMAGES_FOLDER = "images" _ANNOTATIONS_FILE = "captions.html" + SUPPORT_TV_IMAGE_DECODE = True + def dataset_args(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) root = tmpdir / self._IMAGES_FOLDER @@ -1381,6 +1488,8 @@ class Flickr30kTestCase(Flickr8kTestCase): _ANNOTATIONS_FILE = "captions.token" + SUPPORT_TV_IMAGE_DECODE = True + def _image_file_name(self, idx): return f"{idx}.jpg" @@ -1395,7 +1504,7 @@ def _create_annotations_file(self, root, name, images, num_captions_per_image): class MNISTTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.MNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _MAGIC_DTYPES = { torch.uint8: 8, @@ -1465,7 +1574,7 @@ class EMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.EMNIST DEFAULT_CONFIG = dict(split="byclass") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("byclass", "bymerge", "balanced", "letters", "digits", "mnist"), train=(True, False) ) @@ -1476,7 +1585,7 @@ def _prefix(self, config): class QMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.QMNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(what=("train", "test", "test10k", "nist")) + ADDITIONAL_CONFIGS = combinations_grid(what=("train", "test", "test10k", "nist")) _LABELS_SIZE = (8,) _LABELS_DTYPE = torch.int32 @@ -1518,30 +1627,51 @@ def test_num_examples_test50k(self): assert len(dataset) == info["num_examples"] - 10000 +class MovingMNISTTestCase(datasets_utils.DatasetTestCase): + DATASET_CLASS = datasets.MovingMNIST + FEATURE_TYPES = (torch.Tensor,) + + ADDITIONAL_CONFIGS = combinations_grid(split=(None, "train", "test"), split_ratio=(10, 1, 19)) + + _NUM_FRAMES = 20 + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, self.DATASET_CLASS.__name__) + os.makedirs(base_folder, exist_ok=True) + num_samples = 5 + data = np.concatenate( + [ + np.zeros((config["split_ratio"], num_samples, 64, 64)), + np.ones((self._NUM_FRAMES - config["split_ratio"], num_samples, 64, 64)), + ] + ) + np.save(os.path.join(base_folder, "mnist_test_seq.npy"), data) + return num_samples + + @datasets_utils.test_all_configs + def test_split(self, config): + with self.create_dataset(config) as (dataset, _): + if config["split"] == "train": + assert (dataset.data == 0).all() + elif config["split"] == "test": + assert (dataset.data == 1).all() + else: + assert dataset.data.size()[1] == self._NUM_FRAMES + + class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DatasetFolder - # The dataset has no fixed return type since it is defined by the loader parameter. For testing, we use a loader - # that simply returns the path as type 'str' instead of loading anything. See the 'dataset_args()' method. - FEATURE_TYPES = (str, int) - - _IMAGE_EXTENSIONS = ("jpg", "png") - _VIDEO_EXTENSIONS = ("avi", "mp4") - _EXTENSIONS = (*_IMAGE_EXTENSIONS, *_VIDEO_EXTENSIONS) + _EXTENSIONS = ("jpg", "png") # DatasetFolder has two mutually exclusive parameters: 'extensions' and 'is_valid_file'. One of both is required. # We only iterate over different 'extensions' here and handle the tests for 'is_valid_file' in the # 'test_is_valid_file()' method. 
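The MovingMNIST fake data above is built so that `test_split` can tell the splits apart by value alone: the first `split_ratio` frames are all zeros and the remaining frames are all ones. A short sketch of that layout and the slicing it implies (the slicing below mirrors the fake-data construction, not necessarily the dataset's internal implementation):

    import numpy as np

    NUM_FRAMES, split_ratio, num_samples = 20, 10, 5
    data = np.concatenate(
        [
            np.zeros((split_ratio, num_samples, 64, 64)),               # "train" frames
            np.ones((NUM_FRAMES - split_ratio, num_samples, 64, 64)),   # "test" frames
        ]
    )

    train_part, test_part = data[:split_ratio], data[split_ratio:]
    assert (train_part == 0).all() and (test_part == 1).all()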
DEFAULT_CONFIG = dict(extensions=_EXTENSIONS) - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _IMAGE_EXTENSIONS]), - dict(extensions=_IMAGE_EXTENSIONS), - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _VIDEO_EXTENSIONS]), - dict(extensions=_VIDEO_EXTENSIONS), - ) + ADDITIONAL_CONFIGS = combinations_grid(extensions=[(ext,) for ext in _EXTENSIONS]) def dataset_args(self, tmpdir, config): - return tmpdir, lambda x: x + return tmpdir, datasets.folder.pil_loader def inject_fake_data(self, tmpdir, config): extensions = config["extensions"] or self._is_valid_file_to_extensions(config["is_valid_file"]) @@ -1552,18 +1682,16 @@ def inject_fake_data(self, tmpdir, config): if ext not in extensions: continue - create_example_folder = ( - datasets_utils.create_image_folder - if ext in self._IMAGE_EXTENSIONS - else datasets_utils.create_video_folder - ) - num_examples = torch.randint(1, 3, size=()).item() - create_example_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) + datasets_utils.create_image_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) num_examples_total += num_examples classes.append(cls) + if config.pop("make_empty_class", False): + os.makedirs(pathlib.Path(tmpdir) / "empty_class") + classes.append("empty_class") + return dict(num_examples=num_examples_total, classes=classes) def _file_name_fn(self, cls, ext, idx): @@ -1588,6 +1716,23 @@ def test_classes(self, config): assert len(dataset.classes) == len(info["classes"]) assert all([a == b for a, b in zip(dataset.classes, info["classes"])]) + def test_allow_empty(self): + config = { + "extensions": self._EXTENSIONS, + "make_empty_class": True, + } + + config["allow_empty"] = True + with self.create_dataset(config) as (dataset, info): + assert "empty_class" in dataset.classes + assert len(dataset.classes) == len(info["classes"]) + assert all([a == b for a, b in zip(dataset.classes, info["classes"])]) + + config["allow_empty"] = False + with pytest.raises(FileNotFoundError, match="Found no valid file"): + with self.create_dataset(config) as (dataset, info): + pass + class ImageFolderTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ImageFolder @@ -1613,7 +1758,7 @@ def test_classes(self, config): class KittiTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti FEATURE_TYPES = (PIL.Image.Image, (list, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): kitti_dir = os.path.join(tmpdir, "Kitti", "raw") @@ -1645,11 +1790,16 @@ def inject_fake_data(self, tmpdir, config): return split_to_num_examples[config["train"]] + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class SvhnTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SVHN REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "extra")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "extra")) def inject_fake_data(self, tmpdir, config): import scipy.io as sio @@ -1670,7 +1820,7 @@ def inject_fake_data(self, tmpdir, config): class 
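`test_allow_empty` above checks the new `allow_empty` flag: with it, classes whose folders contain no valid files are kept instead of triggering `FileNotFoundError`. A minimal, self-contained illustration using a throwaway loader and text files so no image assets are needed (directory and file names are arbitrary):

    import pathlib
    import tempfile

    from torchvision import datasets

    root = pathlib.Path(tempfile.mkdtemp())
    (root / "cats").mkdir()
    (root / "cats" / "0.txt").write_text("fake sample")
    (root / "empty_class").mkdir()  # no valid files in here

    dataset = datasets.DatasetFolder(
        root, loader=lambda path: path, extensions=("txt",), allow_empty=True
    )
    print(dataset.classes)  # ['cats', 'empty_class'] -- the empty class is kept

    # With allow_empty=False (the default), the same layout raises FileNotFoundError.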
Places365TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Places365 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train-standard", "train-challenge", "val"), small=(False, True), ) @@ -1752,20 +1902,16 @@ def test_class_to_idx(self): with self.create_dataset() as (dataset, _): assert dataset.class_to_idx == class_to_idx - def test_images_download_preexisting(self): - with pytest.raises(RuntimeError): - with self.create_dataset({"download": True}): - pass - class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.INaturalist FEATURE_TYPES = (PIL.Image.Image, (int, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( target_type=("kingdom", "full", "genus", ["kingdom", "phylum", "class", "order", "family", "genus", "full"]), version=("2021_train",), ) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): categories = [ @@ -1799,12 +1945,14 @@ def test_targets(self): class LFWPeopleTestCase(datasets_utils.DatasetTestCase): DATASET_CLASS = datasets.LFWPeople FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("10fold", "train", "test"), image_set=("original", "funneled", "deepfunneled") ) _IMAGES_DIR = {"original": "lfw", "funneled": "lfw_funneled", "deepfunneled": "lfw-deepfunneled"} _file_id = {"10fold": "", "train": "DevTrain", "test": "DevTest"} + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "lfw-py" os.makedirs(tmpdir, exist_ok=True) @@ -1841,6 +1989,18 @@ def _create_random_id(self): part2 = datasets_utils.create_random_string(random.randint(4, 7)) return f"{part1}_{part2}" + def test_tv_decode_image_support(self): + if not self.SUPPORT_TV_IMAGE_DECODE: + pytest.skip(f"{self.DATASET_CLASS.__name__} does not support torchvision.io.decode_image.") + + with self.create_dataset( + config=dict( + loader=decode_image, + ) + ) as (dataset, _): + image = dataset[0][0] + assert isinstance(image, torch.Tensor) + class LFWPairsTestCase(LFWPeopleTestCase): DATASET_CLASS = datasets.LFWPairs @@ -1875,11 +2035,13 @@ def _inject_pairs(self, root, num_pairs, same): class SintelTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Sintel - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "Sintel" @@ -1943,9 +2105,11 @@ def test_bad_input(self): class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.KittiFlow - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "KittiFlow" @@ -2003,7 +2167,7 @@ def test_bad_input(self): class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingChairs - ADDITIONAL_CONFIGS 
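Several test cases below gain `SUPPORT_TV_IMAGE_DECODE = True`, which feeds `torchvision.io.decode_image` in as the dataset's `loader` so samples come back as uint8 tensors rather than PIL images. A small sketch of that usage with `ImageFolder` (assuming a torchvision build where `decode_image` accepts a file path, as the smoke test above does; the temporary folder and file names are illustrative):

    import pathlib
    import tempfile

    import numpy as np
    import PIL.Image
    import torch
    from torchvision import datasets
    from torchvision.io import decode_image

    root = pathlib.Path(tempfile.mkdtemp())
    (root / "class_a").mkdir()
    PIL.Image.fromarray(np.zeros((8, 8, 3), dtype=np.uint8)).save(root / "class_a" / "0.png")

    dataset = datasets.ImageFolder(root, loader=decode_image)
    image, label = dataset[0]
    assert isinstance(image, torch.Tensor) and image.dtype == torch.uint8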
= datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 @@ -2058,13 +2222,15 @@ def test_flow(self, config): class FlyingThings3DTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingThings3D - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test"), pass_name=("clean", "final", "both"), camera=("left", "right", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "FlyingThings3D" @@ -2131,6 +2297,8 @@ def test_bad_input(self): class HD1KTestCase(KittiFlowTestCase): DATASET_CLASS = datasets.HD1K + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir, config): root = pathlib.Path(tmpdir) / "hd1k" @@ -2173,6 +2341,7 @@ def inject_fake_data(self, tmpdir, config): class EuroSATTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.EuroSAT FEATURE_TYPES = (PIL.Image.Image, int) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): data_folder = os.path.join(tmpdir, "eurosat", "2750") @@ -2195,7 +2364,9 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Food101 FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir: str, config): root_folder = pathlib.Path(tmpdir) / "food-101" @@ -2230,9 +2401,10 @@ def inject_fake_data(self, tmpdir: str, config): class FGVCAircraftTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FGVCAircraft - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "val", "trainval", "test"), annotation_level=("variant", "family", "manufacturer") ) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir: str, config): split = config["split"] @@ -2282,6 +2454,8 @@ def inject_fake_data(self, tmpdir: str, config): class SUN397TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SUN397 + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir: str, config): data_dir = pathlib.Path(tmpdir) / "SUN397" data_dir.mkdir() @@ -2313,7 +2487,9 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DTD FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + SUPPORT_TV_IMAGE_DECODE = True + + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test", "val"), # There is no need to test the whole matrix here, since each fold is treated exactly the same partition=(1, 5, 10), @@ -2347,7 +2523,7 @@ def inject_fake_data(self, tmpdir: str, config): class FER2013TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FER2013 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) @@ -2355,34 +2531,74 @@ def inject_fake_data(self, tmpdir, config): base_folder = os.path.join(tmpdir, "fer2013") os.makedirs(base_folder) + use_icml = config.pop("use_icml", 
False) + use_fer = config.pop("use_fer", False) + num_samples = 5 - with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: - writer = csv.DictWriter( - file, - fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), - quoting=csv.QUOTE_NONNUMERIC, - quotechar='"', - ) - writer.writeheader() - for _ in range(num_samples): - row = dict( - pixels=" ".join( - str(pixel) for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() - ) + + if use_icml or use_fer: + pixels_key, usage_key = (" pixels", " Usage") if use_icml else ("pixels", "Usage") + fieldnames = ("emotion", usage_key, pixels_key) if use_icml else ("emotion", pixels_key, usage_key) + filename = "icml_face_data.csv" if use_icml else "fer2013.csv" + with open(os.path.join(base_folder, filename), "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=fieldnames, + quoting=csv.QUOTE_NONNUMERIC, + quotechar='"', ) - if config["split"] == "train": - row["emotion"] = str(int(torch.randint(0, 7, ()))) + writer.writeheader() + for i in range(num_samples): + row = { + "emotion": str(int(torch.randint(0, 7, ()))), + usage_key: "Training" if i % 2 else "PublicTest", + pixels_key: " ".join( + str(pixel) + for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() + ), + } + + writer.writerow(row) + else: + with open(os.path.join(base_folder, f"{config['split']}.csv"), "w", newline="") as file: + writer = csv.DictWriter( + file, + fieldnames=("emotion", "pixels") if config["split"] == "train" else ("pixels",), + quoting=csv.QUOTE_NONNUMERIC, + quotechar='"', + ) + writer.writeheader() + for _ in range(num_samples): + row = dict( + pixels=" ".join( + str(pixel) + for pixel in datasets_utils.create_image_or_video_tensor((48, 48)).view(-1).tolist() + ) + ) + if config["split"] == "train": + row["emotion"] = str(int(torch.randint(0, 7, ()))) - writer.writerow(row) + writer.writerow(row) return num_samples + def test_icml_file(self): + config = {"split": "test"} + with self.create_dataset(config=config) as (dataset, _): + assert all(s[1] is None for s in dataset) + + for split in ("train", "test"): + for d in ({"use_icml": True}, {"use_fer": True}): + config = {"split": split, **d} + with self.create_dataset(config=config) as (dataset, _): + assert all(s[1] is not None for s in dataset) + class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.GTSRB FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): root_folder = os.path.join(tmpdir, "gtsrb") @@ -2432,7 +2648,8 @@ class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CLEVRClassification FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" @@ -2464,9 +2681,9 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.OxfordIIITPet FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = 
combinations_grid( split=("trainval", "test"), - target_types=("category", "segmentation", ["category", "segmentation"], []), + target_types=("category", "binary-category", "segmentation", ["category", "segmentation"], []), ) def inject_fake_data(self, tmpdir, config): @@ -2519,11 +2736,18 @@ def _meta_to_split_and_classification_ann(self, meta, idx): breed_id = "-1" return (image_id, class_id, species, breed_id) + def test_transforms_v2_wrapper_spawn(self): + expected_size = (123, 321) + with self.create_dataset(transform=v2.Resize(size=expected_size)) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset, expected_size=expected_size) + class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StanfordCars REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir, config): import scipy.io as io @@ -2567,7 +2791,9 @@ def inject_fake_data(self, tmpdir, config): class Country211TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Country211 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "valid", "test")) + + SUPPORT_TV_IMAGE_DECODE = True def inject_fake_data(self, tmpdir: str, config): split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] @@ -2594,9 +2820,11 @@ def inject_fake_data(self, tmpdir: str, config): class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Flowers102 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("scipy",) + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir: str, config): base_folder = pathlib.Path(tmpdir) / "flowers-102" @@ -2630,7 +2858,7 @@ def inject_fake_data(self, tmpdir: str, config): class PCAMTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.PCAM - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("h5py",) def inject_fake_data(self, tmpdir: str, config): @@ -2652,9 +2880,11 @@ def inject_fake_data(self, tmpdir: str, config): class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.RenderedSST2 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} + SUPPORT_TV_IMAGE_DECODE = True + def inject_fake_data(self, tmpdir: str, config): root_folder = pathlib.Path(tmpdir) / "rendered-sst2" image_folder = root_folder / self.SPLIT_TO_FOLDER[config["split"]] @@ -2674,7 +2904,7 @@ def inject_fake_data(self, tmpdir: str, config): class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2012Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2736,7 +2966,7 @@ def 
test_bad_input(self): class Kitti2015StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2015Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2841,19 +3071,50 @@ def test_train_splits(self): datasets_utils.shape_test_for_stereo(left, right, disparity) +class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.CREStereo + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, np.ndarray, type(None)) + + def inject_fake_data(self, tmpdir, config): + crestereo_dir = pathlib.Path(tmpdir) / "CREStereo" + os.makedirs(crestereo_dir, exist_ok=True) + + examples = {"tree": 2, "shapenet": 3, "reflective": 6, "hole": 5} + + for category_name in ["shapenet", "reflective", "tree", "hole"]: + split_dir = crestereo_dir / category_name + os.makedirs(split_dir, exist_ok=True) + num_examples = examples[category_name] + + for idx in range(num_examples): + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.jpg", size=(100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.jpg", size=(100, 100)) + # these are going to end up being gray scale images + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_left.disp.png", size=(1, 100, 100)) + datasets_utils.create_image_file(root=split_dir, name=f"{idx}_right.disp.png", size=(1, 100, 100)) + + return sum(examples.values()) + + def test_splits(self): + with self.create_dataset() as (dataset, _): + for left, right, disparity, mask in dataset: + assert mask is None + datasets_utils.shape_test_for_stereo(left, right, disparity) + + class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FallingThingsStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(variant=("single", "mixed", "both")) + ADDITIONAL_CONFIGS = combinations_grid(variant=("single", "mixed", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod - def _make_dummy_depth_map(root: str, name: str, size: Tuple[int, int]): + def _make_dummy_depth_map(root: str, name: str, size: tuple[int, int]): file = pathlib.Path(root) / name image = np.ones((size[0], size[1]), dtype=np.uint8) PIL.Image.fromarray(image).save(file) @staticmethod - def _make_scene_folder(root: str, scene_name: str, size: Tuple[int, int]) -> None: + def _make_scene_folder(root: str, scene_name: str, size: tuple[int, int]) -> None: root = pathlib.Path(root) / scene_name os.makedirs(root, exist_ok=True) # jpg images @@ -2917,14 +3178,14 @@ def test_bad_input(self): class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SceneFlowStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( variant=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod def _create_pfm_folder( - root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: Tuple[int, int] + root: str, name: str, file_name_fn: Callable[..., str], num_examples: int, size: tuple[int, int] ) -> None: root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) @@ -3004,10 +3265,10 @@ def test_bad_input(self): class 
InStereo2k(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.InStereo2k FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) @staticmethod - def _make_scene_folder(root: str, name: str, size: Tuple[int, int]): + def _make_scene_folder(root: str, name: str, size: tuple[int, int]): root = pathlib.Path(root) / name os.makedirs(root, exist_ok=True) @@ -3046,7 +3307,7 @@ def test_bad_input(self): class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SintelStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(pass_name=("final", "clean", "both")) + ADDITIONAL_CONFIGS = combinations_grid(pass_name=("final", "clean", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -3122,7 +3383,7 @@ def test_bad_input(self): class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ETH3DStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod @@ -3187,5 +3448,177 @@ def test_bad_input(self): pass +class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Middlebury2014Stereo + ADDITIONAL_CONFIGS = combinations_grid( + split=("train", "additional"), + calibration=("perfect", "imperfect", "both"), + use_ambient_views=(True, False), + ) + FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) + + @staticmethod + def _make_scene_folder(root_dir: str, scene_name: str, split: str) -> None: + calibrations = [None] if split == "test" else ["-perfect", "-imperfect"] + root_dir = pathlib.Path(root_dir) + + for c in calibrations: + scene_dir = root_dir / f"{scene_name}{c}" + os.makedirs(scene_dir, exist_ok=True) + # make normal images first + datasets_utils.create_image_file(root=scene_dir, name="im0.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1E.png", size=(3, 100, 100)) + datasets_utils.create_image_file(root=scene_dir, name="im1L.png", size=(3, 100, 100)) + # these are going to end up being gray scale images + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=scene_dir / "disp0.pfm") + datasets_utils.make_fake_pfm_file(h=100, w=100, file_name=scene_dir / "disp1.pfm") + + def inject_fake_data(self, tmpdir, config): + split_scene_map = { + "train": ["Adirondack", "Jadeplant", "Motorcycle", "Piano"], + "additional": ["Backpack", "Bicycle1", "Cable", "Classroom1"], + "test": ["Plants", "Classroom2E", "Classroom2", "Australia"], + } + + middlebury_dir = pathlib.Path(tmpdir, "Middlebury2014") + os.makedirs(middlebury_dir, exist_ok=True) + + split_dir = middlebury_dir / config["split"] + os.makedirs(split_dir, exist_ok=True) + + num_examples = {"train": 2, "additional": 3, "test": 4}.get(config["split"], 0) + for idx in range(num_examples): + scene_name = split_scene_map[config["split"]][idx] + self._make_scene_folder(root_dir=split_dir, scene_name=scene_name, split=config["split"]) + + if config["calibration"] == "both": + 
num_examples *= 2 + return num_examples + + def test_train_splits(self): + for split, calibration in itertools.product(["train", "additional"], ["perfect", "imperfect", "both"]): + with self.create_dataset(split=split, calibration=calibration) as (dataset, _): + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo(left, right, disparity, mask) + + def test_test_split(self): + for split in ["test"]: + with self.create_dataset(split=split, calibration=None) as (dataset, _): + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo(left, right) + + def test_augmented_view_usage(self): + with self.create_dataset(split="train", use_ambient_views=True) as (dataset, _): + for left, right, disparity, mask in dataset: + datasets_utils.shape_test_for_stereo(left, right, disparity, mask) + + def test_value_err_train(self): + # train set invalid + split = "train" + calibration = None + with pytest.raises( + ValueError, + match=f"Split '{split}' has calibration settings, however None was provided as an argument." + f"\nSetting calibration to 'perfect' for split '{split}'. Available calibration settings are: 'perfect', 'imperfect', 'both'.", + ): + with self.create_dataset(split=split, calibration=calibration): + pass + + def test_value_err_test(self): + # test set invalid + split = "test" + calibration = "perfect" + with pytest.raises( + ValueError, match="Split 'test' has only no calibration settings, please set `calibration=None`." + ): + with self.create_dataset(split=split, calibration=calibration): + pass + + def test_bad_input(self): + with pytest.raises(ValueError, match="Unknown value 'bad' for argument split"): + with self.create_dataset(split="bad"): + pass + + +class ImagenetteTestCase(datasets_utils.ImageDatasetTestCase): + DATASET_CLASS = datasets.Imagenette + ADDITIONAL_CONFIGS = combinations_grid(split=["train", "val"], size=["full", "320px", "160px"]) + + SUPPORT_TV_IMAGE_DECODE = True + + _WNIDS = [ + "n01440764", + "n02102040", + "n02979186", + "n03000684", + "n03028079", + "n03394916", + "n03417042", + "n03425413", + "n03445777", + "n03888257", + ] + + def inject_fake_data(self, tmpdir, config): + archive_root = "imagenette2" + if config["size"] != "full": + archive_root += f"-{config['size'].replace('px', '')}" + image_root = pathlib.Path(tmpdir) / archive_root / config["split"] + + num_images_per_class = 3 + for wnid in self._WNIDS: + datasets_utils.create_image_folder( + root=image_root, + name=wnid, + file_name_fn=lambda idx: f"{wnid}_{idx}.JPEG", + num_examples=num_images_per_class, + ) + + return num_images_per_class * len(self._WNIDS) + + +class TestDatasetWrapper: + def test_unknown_type(self): + unknown_object = object() + with pytest.raises( + TypeError, match=re.escape("is meant for subclasses of `torchvision.datasets.VisionDataset`") + ): + datasets.wrap_dataset_for_transforms_v2(unknown_object) + + def test_unknown_dataset(self): + class MyVisionDataset(datasets.VisionDataset): + pass + + dataset = MyVisionDataset("root") + + with pytest.raises(TypeError, match="No wrapper exist"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_missing_wrapper(self): + dataset = datasets.FakeData() + + with pytest.raises(TypeError, match="please open an issue"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_subclass(self, mocker): + from torchvision import tv_tensors + + sentinel = object() + mocker.patch.dict( + tv_tensors._dataset_wrapper.WRAPPER_FACTORIES, + clear=False, + 
values={datasets.FakeData: lambda dataset, target_keys: lambda idx, sample: sentinel}, + ) + + class MyFakeData(datasets.FakeData): + pass + + dataset = MyFakeData() + wrapped_dataset = datasets.wrap_dataset_for_transforms_v2(dataset) + + assert wrapped_dataset[0] is sentinel + + if __name__ == "__main__": unittest.main() diff --git a/test/test_datasets_download.py b/test/test_datasets_download.py index b4ae5a6d4b7..856a02b9d44 100644 --- a/test/test_datasets_download.py +++ b/test/test_datasets_download.py @@ -1,11 +1,12 @@ import contextlib import itertools +import shutil import tempfile import time +import traceback import unittest.mock import warnings from datetime import datetime -from distutils import dir_util from os import path from urllib.error import HTTPError, URLError from urllib.parse import urlparse @@ -13,13 +14,7 @@ import pytest from torchvision import datasets -from torchvision.datasets.utils import ( - _get_redirect_url, - check_integrity, - download_file_from_google_drive, - download_url, - USER_AGENT, -) +from torchvision.datasets.utils import _get_redirect_url, USER_AGENT def limit_requests_per_time(min_secs_between_requests=2.0): @@ -83,63 +78,65 @@ def inner_wrapper(request, *args, **kwargs): @contextlib.contextmanager def log_download_attempts( - urls_and_md5s=None, - file="utils", - patch=True, - mock_auxiliaries=None, + urls, + *, + dataset_module, ): - def add_mock(stack, name, file, **kwargs): + def maybe_add_mock(*, module, name, stack, lst=None): + patcher = unittest.mock.patch(f"torchvision.datasets.{module}.{name}") + try: - return stack.enter_context(unittest.mock.patch(f"torchvision.datasets.{file}.{name}", **kwargs)) - except AttributeError as error: - if file != "utils": - return add_mock(stack, name, "utils", **kwargs) - else: - raise pytest.UsageError from error - - if urls_and_md5s is None: - urls_and_md5s = set() - if mock_auxiliaries is None: - mock_auxiliaries = patch + mock = stack.enter_context(patcher) + except AttributeError: + return - with contextlib.ExitStack() as stack: - url_mock = add_mock(stack, "download_url", file, wraps=None if patch else download_url) - google_drive_mock = add_mock( - stack, "download_file_from_google_drive", file, wraps=None if patch else download_file_from_google_drive - ) + if lst is not None: + lst.append(mock) - if mock_auxiliaries: - add_mock(stack, "extract_archive", file) + with contextlib.ExitStack() as stack: + download_url_mocks = [] + download_file_from_google_drive_mocks = [] + for module in [dataset_module, "utils"]: + maybe_add_mock(module=module, name="download_url", stack=stack, lst=download_url_mocks) + maybe_add_mock( + module=module, + name="download_file_from_google_drive", + stack=stack, + lst=download_file_from_google_drive_mocks, + ) + maybe_add_mock(module=module, name="extract_archive", stack=stack) try: - yield urls_and_md5s + yield finally: - for args, kwargs in url_mock.call_args_list: - url = args[0] - md5 = args[-1] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_url_mock in download_url_mocks: + for args, kwargs in download_url_mock.call_args_list: + urls.append(args[0] if args else kwargs["url"]) - for args, kwargs in google_drive_mock.call_args_list: - id = args[0] - url = f"https://drive.google.com/file/d/{id}" - md5 = args[3] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_file_from_google_drive_mock in download_file_from_google_drive_mocks: + for args, kwargs in 
download_file_from_google_drive_mock.call_args_list: + file_id = args[0] if args else kwargs["file_id"] + urls.append(f"https://drive.google.com/file/d/{file_id}") def retry(fn, times=1, wait=5.0): - msgs = [] + tbs = [] for _ in range(times + 1): try: return fn() except AssertionError as error: - msgs.append(str(error)) + tbs.append("".join(traceback.format_exception(type(error), error, error.__traceback__))) time.sleep(wait) else: raise AssertionError( "\n".join( ( - f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time.\n", - *(f"{idx}: {error}" for idx, error in enumerate(msgs, 1)), + "\n", + *[f"{'_' * 40} {idx:2d} {'_' * 40}\n\n{tb}" for idx, tb in enumerate(tbs, 1)], + ( + f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time. " + f"You can find the the full tracebacks above." + ), ) ) ) @@ -149,10 +146,12 @@ def retry(fn, times=1, wait=5.0): def assert_server_response_ok(): try: yield - except URLError as error: - raise AssertionError("The request timed out.") from error except HTTPError as error: raise AssertionError(f"The server returned {error.code}: {error.reason}.") from error + except URLError as error: + raise AssertionError( + "Connection not possible due to SSL." if "SSL" in str(error) else "The request timed out." + ) from error except RecursionError as error: raise AssertionError(str(error)) from error @@ -163,45 +162,14 @@ def assert_url_is_accessible(url, timeout=5.0): urlopen(request, timeout=timeout) -def assert_file_downloads_correctly(url, md5, tmpdir, timeout=5.0): - file = path.join(tmpdir, path.basename(url)) - with assert_server_response_ok(): - with open(file, "wb") as fh: - request = Request(url, headers={"User-Agent": USER_AGENT}) - response = urlopen(request, timeout=timeout) - fh.write(response.read()) - - assert check_integrity(file, md5=md5), "The MD5 checksums mismatch" - - -class DownloadConfig: - def __init__(self, url, md5=None, id=None): - self.url = url - self.md5 = md5 - self.id = id or url +def collect_urls(dataset_cls, *args, **kwargs): + urls = [] + with contextlib.suppress(Exception), log_download_attempts( + urls, dataset_module=dataset_cls.__module__.split(".")[-1] + ): + dataset_cls(*args, **kwargs) - def __repr__(self) -> str: - return self.id - - -def make_download_configs(urls_and_md5s, name=None): - return [ - DownloadConfig(url, md5=md5, id=f"{name}, {url}" if name is not None else None) for url, md5 in urls_and_md5s - ] - - -def collect_download_configs(dataset_loader, name=None, **kwargs): - urls_and_md5s = set() - try: - with log_download_attempts(urls_and_md5s=urls_and_md5s, **kwargs): - dataset = dataset_loader() - except Exception: - dataset = None - - if name is None and dataset is not None: - name = type(dataset).__name__ - - return make_download_configs(urls_and_md5s, name) + return [(url, f"{dataset_cls.__name__}, {url}") for url in urls] # This is a workaround since fixtures, such as the built-in tmp_dir, can only be used within a test but not within a @@ -212,16 +180,18 @@ def collect_download_configs(dataset_loader, name=None, **kwargs): @pytest.fixture(scope="module", autouse=True) def root(): yield ROOT - dir_util.remove_tree(ROOT) + shutil.rmtree(ROOT) def places365(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Places365(ROOT, split=split, small=small, download=True), - name=f"Places365, {split}, {'small' if small else 'large'}", - file="places365", + return itertools.chain.from_iterable( + [ + collect_urls( + 
datasets.Places365, + ROOT, + split=split, + small=small, + download=True, ) for split, small in itertools.product(("train-standard", "train-challenge", "val"), (False, True)) ] @@ -229,85 +199,69 @@ def places365(): def caltech101(): - return collect_download_configs(lambda: datasets.Caltech101(ROOT, download=True), name="Caltech101") + return collect_urls(datasets.Caltech101, ROOT, download=True) def caltech256(): - return collect_download_configs(lambda: datasets.Caltech256(ROOT, download=True), name="Caltech256") + return collect_urls(datasets.Caltech256, ROOT, download=True) def cifar10(): - return collect_download_configs(lambda: datasets.CIFAR10(ROOT, download=True), name="CIFAR10") + return collect_urls(datasets.CIFAR10, ROOT, download=True) def cifar100(): - return collect_download_configs(lambda: datasets.CIFAR100(ROOT, download=True), name="CIFAR100") + return collect_urls(datasets.CIFAR100, ROOT, download=True) def voc(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.VOCSegmentation(ROOT, year=year, download=True), - name=f"VOC, {year}", - file="voc", - ) - for year in ("2007", "2007-test", "2008", "2009", "2010", "2011", "2012") + # TODO: Also test the "2007-test" key + return itertools.chain.from_iterable( + [ + collect_urls(datasets.VOCSegmentation, ROOT, year=year, download=True) + for year in ("2007", "2008", "2009", "2010", "2011", "2012") ] ) def mnist(): with unittest.mock.patch.object(datasets.MNIST, "mirrors", datasets.MNIST.mirrors[-1:]): - return collect_download_configs(lambda: datasets.MNIST(ROOT, download=True), name="MNIST") + return collect_urls(datasets.MNIST, ROOT, download=True) def fashion_mnist(): - return collect_download_configs(lambda: datasets.FashionMNIST(ROOT, download=True), name="FashionMNIST") + return collect_urls(datasets.FashionMNIST, ROOT, download=True) def kmnist(): - return collect_download_configs(lambda: datasets.KMNIST(ROOT, download=True), name="KMNIST") + return collect_urls(datasets.KMNIST, ROOT, download=True) def emnist(): # the 'split' argument can be any valid one, since everything is downloaded anyway - return collect_download_configs(lambda: datasets.EMNIST(ROOT, split="byclass", download=True), name="EMNIST") + return collect_urls(datasets.EMNIST, ROOT, split="byclass", download=True) def qmnist(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.QMNIST(ROOT, what=what, download=True), - name=f"QMNIST, {what}", - file="mnist", - ) - for what in ("train", "test", "nist") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.QMNIST, ROOT, what=what, download=True) for what in ("train", "test", "nist")] ) +def moving_mnist(): + return collect_urls(datasets.MovingMNIST, ROOT, download=True) + + def omniglot(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Omniglot(ROOT, background=background, download=True), - name=f"Omniglot, {'background' if background else 'evaluation'}", - ) - for background in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Omniglot, ROOT, background=background, download=True) for background in (True, False)] ) def phototour(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.PhotoTour(ROOT, name=name, download=True), - name=f"PhotoTour, {name}", - file="phototour", - ) + return itertools.chain.from_iterable( + [ + collect_urls(datasets.PhotoTour, ROOT, name=name, download=True) # The names postfixed with '_harris' point to the domain 
'matthewalunbrown.com'. For some reason all # requests timeout from within CI. They are disabled until this is resolved. for name in ("notredame", "yosemite", "liberty") # "notredame_harris", "yosemite_harris", "liberty_harris" @@ -316,91 +270,51 @@ def phototour(): def sbdataset(): - return collect_download_configs( - lambda: datasets.SBDataset(ROOT, download=True), - name="SBDataset", - file="voc", - ) + return collect_urls(datasets.SBDataset, ROOT, download=True) def sbu(): - return collect_download_configs( - lambda: datasets.SBU(ROOT, download=True), - name="SBU", - file="sbu", - ) + return collect_urls(datasets.SBU, ROOT, download=True) def semeion(): - return collect_download_configs( - lambda: datasets.SEMEION(ROOT, download=True), - name="SEMEION", - file="semeion", - ) + return collect_urls(datasets.SEMEION, ROOT, download=True) def stl10(): - return collect_download_configs( - lambda: datasets.STL10(ROOT, download=True), - name="STL10", - ) + return collect_urls(datasets.STL10, ROOT, download=True) def svhn(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.SVHN(ROOT, split=split, download=True), - name=f"SVHN, {split}", - file="svhn", - ) - for split in ("train", "test", "extra") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.SVHN, ROOT, split=split, download=True) for split in ("train", "test", "extra")] ) def usps(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.USPS(ROOT, train=train, download=True), - name=f"USPS, {'train' if train else 'test'}", - file="usps", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.USPS, ROOT, train=train, download=True) for train in (True, False)] ) def celeba(): - return collect_download_configs( - lambda: datasets.CelebA(ROOT, download=True), - name="CelebA", - file="celeba", - ) + return collect_urls(datasets.CelebA, ROOT, download=True) def widerface(): - return collect_download_configs( - lambda: datasets.WIDERFace(ROOT, download=True), - name="WIDERFace", - file="widerface", - ) + return collect_urls(datasets.WIDERFace, ROOT, download=True) def kinetics(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Kinetics( - path.join(ROOT, f"Kinetics{num_classes}"), - frames_per_clip=1, - num_classes=num_classes, - split=split, - download=True, - ), - name=f"Kinetics, {num_classes}, {split}", - file="kinetics", + return itertools.chain.from_iterable( + [ + collect_urls( + datasets.Kinetics, + path.join(ROOT, f"Kinetics{num_classes}"), + frames_per_clip=1, + num_classes=num_classes, + split=split, + download=True, ) for num_classes, split in itertools.product(("400", "600", "700"), ("train", "val")) ] @@ -408,57 +322,49 @@ def kinetics(): def kitti(): - return itertools.chain( - *[ - collect_download_configs( - lambda train=train: datasets.Kitti(ROOT, train=train, download=True), - name=f"Kitti, {'train' if train else 'test'}", - file="kitti", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Kitti, ROOT, train=train, download=True) for train in (True, False)] ) -def make_parametrize_kwargs(download_configs): - argvalues = [] - ids = [] - for config in download_configs: - argvalues.append((config.url, config.md5)) - ids.append(config.id) - - return dict(argnames=("url", "md5"), argvalues=argvalues, ids=ids) - - -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - caltech101(), - caltech256(), - cifar10(), - 
cifar100(), - # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. - # voc(), - mnist(), - fashion_mnist(), - kmnist(), - emnist(), - qmnist(), - omniglot(), - phototour(), - sbdataset(), - semeion(), - stl10(), - svhn(), - usps(), - celeba(), - widerface(), - kinetics(), - kitti(), - ) +def url_parametrization(*dataset_urls_and_ids_fns): + return pytest.mark.parametrize( + "url", + [ + pytest.param(url, id=id) + for dataset_urls_and_ids_fn in dataset_urls_and_ids_fns + for url, id in sorted(set(dataset_urls_and_ids_fn())) + ], ) + + +@url_parametrization( + caltech101, + caltech256, + cifar10, + cifar100, + # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. + # voc, + mnist, + fashion_mnist, + kmnist, + emnist, + qmnist, + omniglot, + phototour, + sbdataset, + semeion, + stl10, + svhn, + usps, + celeba, + widerface, + kinetics, + kitti, + places365, + sbu, ) -def test_url_is_accessible(url, md5): +def test_url_is_accessible(url): """ If you see this test failing, find the offending dataset in the parametrization and move it to ``test_url_is_not_accessible`` and link an issue detailing the problem. @@ -466,16 +372,11 @@ def test_url_is_accessible(url, md5): retry(lambda: assert_url_is_accessible(url)) -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - places365(), # https://github.com/pytorch/vision/issues/6268 - sbu(), # https://github.com/pytorch/vision/issues/6390 - ) - ) -) +# TODO: if e.g. caltech101 starts failing, remove the pytest.mark.parametrize below and use +# @url_parametrization(caltech101) +@pytest.mark.parametrize("url", ("http://url_that_doesnt_exist.com",)) # here until we actually have a failing dataset @pytest.mark.xfail -def test_url_is_not_accessible(url, md5): +def test_url_is_not_accessible(url): """ As the name implies, this test is the 'inverse' of ``test_url_is_accessible``. Since the download servers are beyond our control, some files might not be accessible for longer stretches of time. Still, we want to know if they @@ -484,9 +385,4 @@ def test_url_is_not_accessible(url, md5): If you see this test failing, find the offending dataset in the parametrization and move it to ``test_url_is_accessible``. 
""" - retry(lambda: assert_url_is_accessible(url)) - - -@pytest.mark.parametrize(**make_parametrize_kwargs(itertools.chain())) -def test_file_downloads_correctly(url, md5): - retry(lambda: assert_file_downloads_correctly(url, md5)) + assert_url_is_accessible(url) diff --git a/test/test_datasets_utils.py b/test/test_datasets_utils.py index ec68fd72a5b..461688405d7 100644 --- a/test/test_datasets_utils.py +++ b/test/test_datasets_utils.py @@ -7,7 +7,9 @@ import zipfile import pytest +import torch import torchvision.datasets.utils as utils +from common_utils import assert_equal from torch._utils_internal import get_file_path_2 from torchvision.datasets.folder import make_dataset from torchvision.datasets.utils import _COMPRESSED_FILE_OPENERS @@ -56,8 +58,11 @@ def test_get_redirect_url_max_hops_exceeded(self, mocker): assert mock.call_count == 1 assert mock.call_args[0][0].full_url == url - def test_check_md5(self): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_check_md5(self, use_pathlib): fpath = TEST_FILE + if use_pathlib: + fpath = pathlib.Path(fpath) correct_md5 = "9c0bb82894bb3af7f7675ef2b3b6dcdc" false_md5 = "" assert utils.check_md5(fpath, correct_md5) @@ -114,7 +119,8 @@ def test_detect_file_type_incompatible(self, file): utils._detect_file_type(file) @pytest.mark.parametrize("extension", [".bz2", ".gz", ".xz"]) - def test_decompress(self, extension, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_decompress(self, extension, tmpdir, use_pathlib): def create_compressed(root, content="this is the content"): file = os.path.join(root, "file") compressed = f"{file}{extension}" @@ -126,6 +132,8 @@ def create_compressed(root, content="this is the content"): return compressed, file, content compressed, file, content = create_compressed(tmpdir) + if use_pathlib: + compressed = pathlib.Path(compressed) utils._decompress(compressed) @@ -138,7 +146,8 @@ def test_decompress_no_compression(self): with pytest.raises(RuntimeError): utils._decompress("foo.tar") - def test_decompress_remove_finished(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_decompress_remove_finished(self, tmpdir, use_pathlib): def create_compressed(root, content="this is the content"): file = os.path.join(root, "file") compressed = f"{file}.gz" @@ -149,10 +158,20 @@ def create_compressed(root, content="this is the content"): return compressed, file, content compressed, file, content = create_compressed(tmpdir) + print(f"{type(compressed)=}") + if use_pathlib: + compressed = pathlib.Path(compressed) + tmpdir = pathlib.Path(tmpdir) - utils.extract_archive(compressed, tmpdir, remove_finished=True) + extracted_dir = utils.extract_archive(compressed, tmpdir, remove_finished=True) assert not os.path.exists(compressed) + if use_pathlib: + assert isinstance(extracted_dir, pathlib.Path) + assert isinstance(compressed, pathlib.Path) + else: + assert isinstance(extracted_dir, str) + assert isinstance(compressed, str) @pytest.mark.parametrize("extension", [".gz", ".xz"]) @pytest.mark.parametrize("remove_finished", [True, False]) @@ -165,7 +184,8 @@ def test_extract_archive_defer_to_decompress(self, extension, remove_finished, m mocked.assert_called_once_with(file, filename, remove_finished=remove_finished) - def test_extract_zip(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_extract_zip(self, tmpdir, use_pathlib): def create_archive(root, content="this is the content"): file = os.path.join(root, "dst.txt") archive = 
os.path.join(root, "archive.zip") @@ -175,6 +195,8 @@ def create_archive(root, content="this is the content"): return archive, file, content + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) archive, file, content = create_archive(tmpdir) utils.extract_archive(archive, tmpdir) @@ -187,7 +209,8 @@ def create_archive(root, content="this is the content"): @pytest.mark.parametrize( "extension, mode", [(".tar", "w"), (".tar.gz", "w:gz"), (".tgz", "w:gz"), (".tar.xz", "w:xz")] ) - def test_extract_tar(self, extension, mode, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_extract_tar(self, extension, mode, tmpdir, use_pathlib): def create_archive(root, extension, mode, content="this is the content"): src = os.path.join(root, "src.txt") dst = os.path.join(root, "dst.txt") @@ -201,6 +224,8 @@ def create_archive(root, extension, mode, content="this is the content"): return archive, dst, content + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) archive, file, content = create_archive(tmpdir, extension, mode) utils.extract_archive(archive, tmpdir) @@ -215,6 +240,24 @@ def test_verify_str_arg(self): pytest.raises(ValueError, utils.verify_str_arg, 0, ("a",), "arg") pytest.raises(ValueError, utils.verify_str_arg, "b", ("a",), "arg") + @pytest.mark.parametrize( + ("dtype", "actual_hex", "expected_hex"), + [ + (torch.uint8, "01 23 45 67 89 AB CD EF", "01 23 45 67 89 AB CD EF"), + (torch.float16, "01 23 45 67 89 AB CD EF", "23 01 67 45 AB 89 EF CD"), + (torch.int32, "01 23 45 67 89 AB CD EF", "67 45 23 01 EF CD AB 89"), + (torch.float64, "01 23 45 67 89 AB CD EF", "EF CD AB 89 67 45 23 01"), + ], + ) + def test_flip_byte_order(self, dtype, actual_hex, expected_hex): + def to_tensor(hex): + return torch.frombuffer(bytes.fromhex(hex), dtype=dtype) + + assert_equal( + utils._flip_byte_order(to_tensor(actual_hex)), + to_tensor(expected_hex), + ) + @pytest.mark.parametrize( ("kwargs", "expected_error_msg"), diff --git a/test/test_datasets_video_utils.py b/test/test_datasets_video_utils.py index adaa4f5446c..51330911e50 100644 --- a/test/test_datasets_video_utils.py +++ b/test/test_datasets_video_utils.py @@ -60,7 +60,7 @@ def test_video_clips_custom_fps(self, tmpdir): video_list = get_list_of_videos(tmpdir, num_videos=3, sizes=[12, 12, 12], fps=[3, 4, 6]) num_frames = 4 for fps in [1, 3, 4, 10]: - video_clips = VideoClips(video_list, num_frames, num_frames, fps, num_workers=2) + video_clips = VideoClips(video_list, num_frames, num_frames, fps) for i in range(video_clips.num_clips()): video, audio, info, video_idx = video_clips.get_clip(i) assert video.shape[0] == num_frames diff --git a/test/test_extended_models.py b/test/test_extended_models.py index 55259bb150d..a9072826724 100644 --- a/test/test_extended_models.py +++ b/test/test_extended_models.py @@ -1,12 +1,15 @@ +import copy import os +import pickle import pytest import test_models as TM import torch +from common_extended_utils import get_file_size_mb, get_ops from torchvision import models -from torchvision.models._api import get_model_weights, Weights, WeightsEnum +from torchvision.models import get_model_weights, Weights, WeightsEnum from torchvision.models._utils import handle_legacy_interface - +from torchvision.models.detection.backbone_utils import mobilenet_backbone, resnet_fpn_backbone run_if_test_with_extended = pytest.mark.skipif( os.getenv("PYTORCH_TEST_WITH_EXTENDED", "0") != "1", @@ -29,6 +32,21 @@ def test_get_model(name, model_class): assert isinstance(models.get_model(name), model_class) 
+@pytest.mark.parametrize( + "name, model_fn", + [ + ("resnet50", models.resnet50), + ("retinanet_resnet50_fpn_v2", models.detection.retinanet_resnet50_fpn_v2), + ("raft_large", models.optical_flow.raft_large), + ("quantized_resnet50", models.quantization.resnet50), + ("lraspp_mobilenet_v3_large", models.segmentation.lraspp_mobilenet_v3_large), + ("mvit_v1_b", models.video.mvit_v1_b), + ], +) +def test_get_model_builder(name, model_fn): + assert models.get_model_builder(name) == model_fn + + @pytest.mark.parametrize( "name, weight", [ @@ -44,24 +62,125 @@ def test_get_model_weights(name, weight): assert models.get_model_weights(name) == weight +@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy]) +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_copyable(copy_fn, name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that (deep-)copying is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. + assert copy_fn(weights) is weights + + +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_deserializable(name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that deserialization is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. 
+ assert pickle.loads(pickle.dumps(weights)) is weights + + +def get_models_from_module(module): + return [ + v.__name__ + for k, v in module.__dict__.items() + if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ + ] + + @pytest.mark.parametrize( "module", [models, models.detection, models.quantization, models.segmentation, models.video, models.optical_flow] ) def test_list_models(module): - def get_models_from_module(module): - return [ - v.__name__ - for k, v in module.__dict__.items() - if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ - ] - a = set(get_models_from_module(module)) - b = set(x.replace("quantized_", "") for x in models.list_models(module)) + b = {x.replace("quantized_", "") for x in models.list_models(module)} assert len(b) > 0 assert a == b +@pytest.mark.parametrize( + "include_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + "*not-existing-model-for-test?", + ["*resnet*", "*alexnet*"], + ["*resnet*", "*alexnet*", "*not-existing-model-for-test?"], + ("*resnet*", "*alexnet*"), + {"*resnet*", "*alexnet*"}, + ], +) +@pytest.mark.parametrize( + "exclude_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + ["*not-existing-model-for-test?"], + ["resnet34", "*not-existing-model-for-test?"], + ["resnet34", "*resnet1*"], + ("resnet34", "*resnet1*"), + {"resnet34", "*resnet1*"}, + ], +) +def test_list_models_filters(include_filters, exclude_filters): + actual = set(models.list_models(models, include=include_filters, exclude=exclude_filters)) + classification_models = set(get_models_from_module(models)) + + if isinstance(include_filters, str): + include_filters = [include_filters] + if isinstance(exclude_filters, str): + exclude_filters = [exclude_filters] + + if include_filters: + expected = set() + for include_f in include_filters: + include_f = include_f.strip("*?") + expected = expected | {x for x in classification_models if include_f in x} + else: + expected = classification_models + + if exclude_filters: + for exclude_f in exclude_filters: + exclude_f = exclude_f.strip("*?") + if exclude_f != "": + a_exclude = {x for x in classification_models if exclude_f in x} + expected = expected - a_exclude + + assert expected == actual + + @pytest.mark.parametrize( "name, weight", [ @@ -96,6 +215,22 @@ def test_naming_conventions(model_fn): assert len(weights_enum) == 0 or hasattr(weights_enum, "DEFAULT") +detection_models_input_dims = { + "fasterrcnn_mobilenet_v3_large_320_fpn": (320, 320), + "fasterrcnn_mobilenet_v3_large_fpn": (800, 800), + "fasterrcnn_resnet50_fpn": (800, 800), + "fasterrcnn_resnet50_fpn_v2": (800, 800), + "fcos_resnet50_fpn": (800, 800), + "keypointrcnn_resnet50_fpn": (1333, 1333), + "maskrcnn_resnet50_fpn": (800, 800), + "maskrcnn_resnet50_fpn_v2": (800, 800), + "retinanet_resnet50_fpn": (800, 800), + "retinanet_resnet50_fpn_v2": (800, 800), + "ssd300_vgg16": (300, 300), + "ssdlite320_mobilenet_v3_large": (320, 320), +} + + @pytest.mark.parametrize( "model_fn", TM.list_model_fns(models) @@ -107,6 +242,9 @@ def test_naming_conventions(model_fn): ) @run_if_test_with_extended def test_schema_meta_validation(model_fn): + if model_fn.__name__ == "maskrcnn_resnet50_fpn_v2": + pytest.skip(reason="FIXME https://github.com/pytorch/vision/issues/7349") + # list of all possible supported high-level fields for weights meta-data permitted_fields = { "backend", @@ -120,11 +258,13 @@ def test_schema_meta_validation(model_fn): "recipe", "unquantized", "_docs", + "_ops", + 
"_file_size", } # mandatory fields for each computer vision task classification_fields = {"categories", ("_metrics", "ImageNet-1K", "acc@1"), ("_metrics", "ImageNet-1K", "acc@5")} defaults = { - "all": {"_metrics", "min_size", "num_params", "recipe", "_docs"}, + "all": {"_metrics", "min_size", "num_params", "recipe", "_docs", "_file_size", "_ops"}, "models": classification_fields, "detection": {"categories", ("_metrics", "COCO-val2017", "box_map")}, "quantization": classification_fields | {"backend", "unquantized"}, @@ -145,37 +285,60 @@ def test_schema_meta_validation(model_fn): pytest.skip(f"Model '{model_name}' doesn't have any pre-trained weights.") problematic_weights = {} - incorrect_params = [] + incorrect_meta = [] bad_names = [] for w in weights_enum: actual_fields = set(w.meta.keys()) - actual_fields |= set( + actual_fields |= { ("_metrics", dataset, metric_key) for dataset in w.meta.get("_metrics", {}).keys() for metric_key in w.meta.get("_metrics", {}).get(dataset, {}).keys() - ) + } missing_fields = expected_fields - actual_fields unsupported_fields = set(w.meta.keys()) - permitted_fields if missing_fields or unsupported_fields: problematic_weights[w] = {"missing": missing_fields, "unsupported": unsupported_fields} - if w == weights_enum.DEFAULT: + + if w == weights_enum.DEFAULT or any(w.meta[k] != weights_enum.DEFAULT.meta[k] for k in ["num_params", "_ops"]): if module_name == "quantization": # parameters() count doesn't work well with quantization, so we check against the non-quantized unquantized_w = w.meta.get("unquantized") - if unquantized_w is not None and w.meta.get("num_params") != unquantized_w.meta.get("num_params"): - incorrect_params.append(w) + if unquantized_w is not None: + if w.meta.get("num_params") != unquantized_w.meta.get("num_params"): + incorrect_meta.append((w, "num_params")) + + # the methodology for quantized ops count doesn't work as well, so we take unquantized FLOPs + # instead + if w.meta["_ops"] != unquantized_w.meta.get("_ops"): + incorrect_meta.append((w, "_ops")) + else: - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) - else: - if w.meta.get("num_params") != weights_enum.DEFAULT.meta.get("num_params"): - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) + # loading the model and using it for parameter and ops verification + model = model_fn(weights=w) + + if w.meta.get("num_params") != sum(p.numel() for p in model.parameters()): + incorrect_meta.append((w, "num_params")) + + kwargs = {} + if model_name in detection_models_input_dims: + # detection models have non default height and width + height, width = detection_models_input_dims[model_name] + kwargs = {"height": height, "width": width} + + if not model_fn.__name__.startswith("vit"): + # FIXME: https://github.com/pytorch/vision/issues/7871 + calculated_ops = get_ops(model=model, weight=w, **kwargs) + if calculated_ops != w.meta["_ops"]: + incorrect_meta.append((w, "_ops")) + if not w.name.isupper(): bad_names.append(w) + if get_file_size_mb(w) != w.meta.get("_file_size"): + incorrect_meta.append((w, "_file_size")) + assert not problematic_weights - assert not incorrect_params + assert not incorrect_meta assert not bad_names @@ -320,3 +483,21 @@ def builder(*, weights=None, flag): with pytest.raises(ValueError, match="weights"): builder(pretrained=True, flag=False) + + @pytest.mark.parametrize( + "model_fn", + [fn for fn in TM.list_model_fns(models) if 
fn.__name__ not in {"vit_h_14", "regnet_y_128gf"}] + + TM.list_model_fns(models.detection) + + TM.list_model_fns(models.quantization) + + TM.list_model_fns(models.segmentation) + + TM.list_model_fns(models.video) + + TM.list_model_fns(models.optical_flow) + + [ + lambda pretrained: resnet_fpn_backbone(backbone_name="resnet50", pretrained=pretrained), + lambda pretrained: mobilenet_backbone(backbone_name="mobilenet_v2", fpn=False, pretrained=pretrained), + ], + ) + @run_if_test_with_extended + def test_pretrained_deprecation(self, model_fn): + with pytest.warns(UserWarning, match="deprecated"): + model_fn(pretrained=True) diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 1914bc571fb..7d491372b77 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -2,17 +2,17 @@ import itertools import math import os -import re +from collections.abc import Sequence from functools import partial -from typing import Sequence import numpy as np +import PIL.Image import pytest import torch import torchvision.transforms as T +import torchvision.transforms._functional_pil as F_pil +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_pil as F_pil -import torchvision.transforms.functional_tensor as F_t from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, @@ -20,15 +20,20 @@ _create_data_batch, _test_fn_on_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, needs_cuda, ) from torchvision.transforms import InterpolationMode -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fn", [F.get_image_size, F.get_image_num_channels, F.get_dimensions]) def test_image_sizes(device, fn): script_F = torch.jit.script(fn) @@ -66,7 +71,7 @@ class TestRotate: scripted_rotate = torch.jit.script(F.rotate) IMG_W = 26 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(7, 33), (26, IMG_W), (32, IMG_W)]) @pytest.mark.parametrize( "center", @@ -125,7 +130,7 @@ def test_rotate(self, device, height, width, center, dt, angle, expand, fill, fn f"{out_pil_tensor[0, :7, :7]}" ) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_rotate_batch(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -139,33 +144,11 @@ def test_rotate_batch(self, device, dt): center = (20, 22) _test_fn_on_batch(batch_tensors, F.rotate, angle=32, interpolation=NEAREST, expand=True, center=center) - def test_rotate_deprecation_resample(self): - tensor, _ = _create_data(26, 26) - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed 0.14. " - "Please use 'interpolation' instead." 
- ), - ): - res1 = F.rotate(tensor, 45, resample=2) - res2 = F.rotate(tensor, 45, interpolation=BILINEAR) - assert_equal(res1, res2) - def test_rotate_interpolation_type(self): tensor, _ = _create_data(26, 26) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." - ), - ): - res1 = F.rotate(tensor, 45, interpolation=2) - res2 = F.rotate(tensor, 45, interpolation=BILINEAR) - assert_equal(res1, res2) + res1 = F.rotate(tensor, 45, interpolation=PIL.Image.BILINEAR) + res2 = F.rotate(tensor, 45, interpolation=BILINEAR) + assert_equal(res1, res2) class TestAffine: @@ -173,7 +156,7 @@ class TestAffine: ALL_DTYPES = [None, torch.float32, torch.float64, torch.float16] scripted_affine = torch.jit.script(F.affine) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_identity_map(self, device, height, width, dt): @@ -196,7 +179,7 @@ def test_identity_map(self, device, height, width, dt): ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -240,7 +223,7 @@ def test_square_rotations(self, device, height, width, dt, angle, config, fn): # Tolerance : less than 6% of different pixels assert ratio_diff_pixels < 0.06 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("angle", [90, 45, 15, -30, -60, -120]) @@ -274,7 +257,7 @@ def test_rect_rotations(self, device, height, width, dt, angle, fn, center): # Tolerance : less than 3% of different pixels assert ratio_diff_pixels < 0.03 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("t", [[10, 12], (-12, -13)]) @@ -299,7 +282,7 @@ def test_translations(self, device, height, width, dt, t, fn): _assert_equal_tensor_to_pil(out_tensor, out_pil_img) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -309,24 +292,8 @@ def test_translations(self, device, height, width, dt, t, fn): (33, (5, -4), 1.0, [0.0, 0.0], [0, 0, 0]), (45, [-5, 4], 1.2, [0.0, 0.0], (1, 2, 3)), (33, (-4, -8), 2.0, [0.0, 0.0], [255, 255, 255]), - ( - 85, - (10, -10), - 0.7, - [0.0, 0.0], - [ - 1, - ], - ), - ( - 0, - [0, 0], - 1.0, - [ - 35.0, - ], - (2.0,), - ), + (85, (10, -10), 0.7, [0.0, 0.0], [1]), + (0, [0, 0], 1.0, [35.0], (2.0,)), (-25, [0, 0], 1.2, [0.0, 15.0], None), (-45, [-10, 0], 0.7, [2.0, 5.0], None), (-45, [-10, -10], 1.2, [4.0, 5.0], None), @@ -360,7 +327,7 @@ def test_all_ops(self, device, height, width, dt, a, t, s, sh, f, fn): tol = 0.06 if device == "cuda" else 0.05 assert ratio_diff_pixels < tol - @pytest.mark.parametrize("device", 
cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_batches(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -373,45 +340,13 @@ def test_batches(self, device, dt): _test_fn_on_batch(batch_tensors, F.affine, angle=-43, translate=[-3, 4], scale=1.2, shear=[4.0, 5.0]) - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_warnings(self, device): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_interpolation_type(self, device): tensor, pil_img = _create_data(26, 26, device=device) - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'interpolation' instead." - ), - ): - res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], resample=2) - res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) - assert_equal(res1, res2) - - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." - ), - ): - res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=2) - res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) - assert_equal(res1, res2) - - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'fillcolor' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'fill' instead." - ), - ): - res1 = F.affine(pil_img, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], fillcolor=10) - res2 = F.affine(pil_img, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], fill=10) - # we convert the PIL images to numpy as assert_equal doesn't work on PIL images. 
- assert_equal(np.asarray(res1), np.asarray(res2)) + res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=PIL.Image.BILINEAR) + res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) + assert_equal(res1, res2) def _get_data_dims_and_points_for_perspective(): @@ -437,22 +372,10 @@ def _get_data_dims_and_points_for_perspective(): return dims_and_points -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "fill", - ( - None, - [0, 0, 0], - [1, 2, 3], - [255, 255, 255], - [ - 1, - ], - (2.0,), - ), -) +@pytest.mark.parametrize("fill", (None, [0, 0, 0], [1, 2, 3], [255, 255, 255], [1], (2.0,))) @pytest.mark.parametrize("fn", [F.perspective, torch.jit.script(F.perspective)]) def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): @@ -483,7 +406,7 @@ def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): assert ratio_diff_pixels < 0.05 -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) def test_perspective_batch(device, dims_and_points, dt): @@ -511,40 +434,21 @@ def test_perspective_batch(device, dims_and_points, dt): ) -def test_perspective_interpolation_warning(): - # assert changed type warning +def test_perspective_interpolation_type(): spoints = [[0, 0], [33, 0], [33, 25], [0, 25]] epoints = [[3, 2], [32, 3], [30, 24], [2, 25]] tensor = torch.randint(0, 256, (3, 26, 26)) - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=2) - res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) - assert_equal(res1, res2) + + res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=PIL.Image.BILINEAR) + res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) + assert_equal(res1, res2) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "size", - [ - 32, - 26, - [ - 32, - ], - [32, 32], - (32, 32), - [26, 35], - ], -) +@pytest.mark.parametrize("size", [32, 26, [32], [32, 32], (32, 32), [26, 35]]) @pytest.mark.parametrize("max_size", [None, 34, 40, 1000]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) +@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize(device, dt, size, max_size, interpolation): if dt == torch.float16 and device == "cpu": @@ -564,14 +468,12 @@ def test_resize(device, dt, size, max_size, interpolation): tensor = tensor.to(dt) batch_tensors = batch_tensors.to(dt) - resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size) + resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size, antialias=True) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] - if interpolation not in [ - NEAREST, - ]: + if interpolation != NEAREST: # We can not check values if mode = NEAREST, as results are different # E.g. resized_tensor = [[a, a, b, c, d, d, e, ...]] # E.g. resized_pil_img = [[a, b, c, c, d, e, f, ...]] @@ -581,36 +483,27 @@ def test_resize(device, dt, size, max_size, interpolation): resized_tensor_f = resized_tensor_f.to(torch.float) # Pay attention to high tolerance for MAE - _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=8.0) + _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=3.0) if isinstance(size, int): - script_size = [ - size, - ] + script_size = [size] else: script_size = size - resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size) + resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True) assert_equal(resized_tensor, resize_result) - _test_fn_on_batch(batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size) + _test_fn_on_batch( + batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True + ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_resize_asserts(device): tensor, pil_img = _create_data(26, 36, device=device) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.resize(tensor, size=32, interpolation=2) - + res1 = F.resize(tensor, size=32, interpolation=PIL.Image.BILINEAR) res2 = F.resize(tensor, size=32, interpolation=BILINEAR) assert_equal(res1, res2) @@ -622,7 +515,7 @@ def test_resize_asserts(device): F.resize(img, size=32, max_size=32) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("size", [[96, 72], [96, 420], [420, 72]]) @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) @@ -641,7 +534,7 @@ def test_resize_antialias(device, dt, size, interpolation): tensor = tensor.to(dt) resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, antialias=True) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] @@ -675,40 +568,6 @@ def test_resize_antialias(device, dt, size, interpolation): assert_equal(resized_tensor, resize_result) -@needs_cuda -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_assert_resize_antialias(interpolation): - - # Checks implementation on very large scales - # and catch TORCH_CHECK inside PyTorch implementation - torch.manual_seed(12) - tensor, _ = _create_data(1000, 1000, device="cuda") - - # Error message is not yet updated in pytorch nightly - # with pytest.raises(RuntimeError, match=r"Provided interpolation parameters can not be handled"): - with pytest.raises(RuntimeError, match=r"Too much shared memory required"): - F.resize(tensor, size=(5, 5), interpolation=interpolation, antialias=True) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("dt", [torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("size", [[10, 7], [10, 42], [42, 7]]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_interpolate_antialias_backward(device, dt, size, interpolation): - - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return - - torch.manual_seed(12) - x = (torch.rand(1, 32, 29, 3, dtype=torch.double, device=device).permute(0, 3, 1, 2).requires_grad_(True),) - resize = partial(F.resize, size=size, interpolation=interpolation, antialias=True) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) - - x = (torch.rand(1, 3, 32, 29, dtype=torch.double, device=device, requires_grad=True),) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) - - def check_functional_vs_PIL_vs_scripted( fn, fn_pil, fn_t, config, device, dtype, channels=3, tol=2.0 + 1e-10, agg_method="max" ): @@ -746,7 +605,7 @@ def check_functional_vs_PIL_vs_scripted( _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=atol, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"brightness_factor": f} for f in (0.1, 0.5, 1.0, 1.34, 2.5)]) @pytest.mark.parametrize("channels", [1, 3]) @@ -762,7 +621,7 @@ def test_adjust_brightness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, 
torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_invert(device, dtype, channels): @@ -771,7 +630,7 @@ def test_invert(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"bits": bits} for bits in range(0, 8)]) @pytest.mark.parametrize("channels", [1, 3]) def test_posterize(device, config, channels): @@ -788,7 +647,7 @@ def test_posterize(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0, 64, 128, 192, 255]]) @pytest.mark.parametrize("channels", [1, 3]) def test_solarize1(device, config, channels): @@ -805,7 +664,7 @@ def test_solarize1(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0.0, 0.25, 0.5, 0.75, 1.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -823,37 +682,45 @@ def test_solarize2(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0.0, 0.25, 0.5, 0.75, 1.0]) -def test_solarize_threshold1_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [1.5]) -def test_solarize_threshold1_upper_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - with pytest.raises(TypeError, match="Threshold should be less than bound of img."): - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0, 64, 128, 192, 255]) -def test_solarize_threshold2_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + *[ + (dtype, threshold) + for dtype, threshold in itertools.product( + [torch.float32, torch.float16], + [0.0, 0.25, 0.5, 0.75, 1.0], + ) + ], + *[(torch.uint8, threshold) for threshold in [0, 64, 128, 192, 255]], + *[(torch.int64, threshold) for threshold in [0, 2**32, 2**63 - 1]], + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_within_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [260]) -def test_solarize_threshold2_upper_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + (torch.float32, 1.5), + (torch.float16, 1.5), + (torch.uint8, 260), + (torch.int64, 2**64), + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_above_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) with pytest.raises(TypeError, match="Threshold should be less than bound of img."): F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"sharpness_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -869,7 +736,7 @@ def test_adjust_sharpness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast(device, dtype, channels): @@ -878,7 +745,7 @@ def test_autocontrast(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast_equal_minmax(device, dtype, channels): @@ -890,7 +757,7 @@ def test_autocontrast_equal_minmax(device, dtype, channels): assert (F.autocontrast(a)[0] == F.autocontrast(a[0])).all() -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) def test_equalize(device, channels): torch.use_deterministic_algorithms(False) @@ -907,7 +774,7 @@ def test_equalize(device, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"contrast_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -917,7 +784,7 @@ def test_adjust_contrast(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"saturation_factor": f} for f in [0.5, 0.75, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -927,7 +794,7 @@ def test_adjust_saturation(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"hue_factor": f} for f in [-0.45, -0.25, 0.0, 0.25, 0.45]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -937,7 +804,7 @@ def test_adjust_hue(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"gamma": g1, "gain": g2} for g1, g2 in zip([0.8, 1.0, 1.2], [0.7, 1.0, 1.3])]) @pytest.mark.parametrize("channels", [1, 3]) @@ -953,7 +820,7 @@ def test_adjust_gamma(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("pad", [2, [3], [0, 3], (3, 3), [4, 2, 4, 3]]) @pytest.mark.parametrize( @@ -1003,14 +870,16 @@ def test_pad(device, dt, pad, config): _test_fn_on_batch(batch_tensors, F.pad, padding=script_pad, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("mode", [NEAREST, BILINEAR, BICUBIC]) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("mode", [NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC]) def test_resized_crop(device, mode): # test values of F.resized_crop in several cases: # 1) resize to the same size, crop to the same size => should be identity tensor, _ = _create_data(26, 36, device=device) - out_tensor = F.resized_crop(tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode) + out_tensor = F.resized_crop( + tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode, antialias=True + ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") # 2) resize by half and crop a TL corner @@ -1025,11 +894,18 @@ def test_resized_crop(device, mode): batch_tensors = _create_data_batch(26, 36, num_samples=4, device=device) _test_fn_on_batch( - batch_tensors, F.resized_crop, top=1, left=2, height=20, width=30, size=[10, 15], interpolation=NEAREST + batch_tensors, + F.resized_crop, + top=1, + left=2, + height=20, + width=30, + size=[10, 15], + interpolation=NEAREST, ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func, args", [ @@ -1062,7 +938,7 @@ def test_assert_image_tensor(device, func, args): func(tensor, *args) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_vflip(device): script_vflip = torch.jit.script(F.vflip) @@ -1079,7 +955,7 @@ def test_vflip(device): _test_fn_on_batch(batch_tensors, F.vflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hflip(device): script_hflip = torch.jit.script(F.hflip) @@ -1096,7 +972,7 @@ def test_hflip(device): _test_fn_on_batch(batch_tensors, F.hflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "top, left, height, width", [ @@ -1104,6 +980,8 @@ def test_hflip(device): (2, 12, 3, 4), # crop inside top-right corner (8, 3, 5, 6), # crop inside bottom-left corner (8, 11, 4, 3), # crop inside bottom-right corner + (50, 50, 10, 10), # crop outside the image + (-50, -50, 10, 10), # crop outside the image ], ) def test_crop(device, top, left, height, width): @@ -1123,7 +1001,7 @@ def test_crop(device, top, left, height, width): _test_fn_on_batch(batch_tensors, F.crop, top=top, left=left, height=height, width=width) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("image_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @@ -1146,7 +1024,8 @@ def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn): # "23_23_1.7": ... 
# } p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") - true_cv2_results = torch.load(p) + + true_cv2_results = torch.load(p, weights_only=False) if image_size == "small": tensor = ( @@ -1177,7 +1056,7 @@ def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn): torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hsv2rgb(device): scripted_fn = torch.jit.script(F_t._hsv2rgb) shape = (3, 100, 150) @@ -1208,7 +1087,7 @@ def test_hsv2rgb(device): _test_fn_on_batch(batch_tensors, F_t._hsv2rgb) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_rgb2hsv(device): scripted_fn = torch.jit.script(F_t._rgb2hsv) shape = (3, 150, 100) @@ -1247,7 +1126,7 @@ def test_rgb2hsv(device): _test_fn_on_batch(batch_tensors, F_t._rgb2hsv) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_output_channels", (3, 1)) def test_rgb_to_grayscale(device, num_output_channels): script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale) @@ -1266,7 +1145,7 @@ def test_rgb_to_grayscale(device, num_output_channels): _test_fn_on_batch(batch_tensors, F.rgb_to_grayscale, num_output_channels=num_output_channels) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device): script_center_crop = torch.jit.script(F.center_crop) @@ -1284,7 +1163,7 @@ def test_center_crop(device): _test_fn_on_batch(batch_tensors, F.center_crop, output_size=[10, 11]) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_five_crop(device): script_five_crop = torch.jit.script(F.five_crop) @@ -1318,7 +1197,7 @@ def test_five_crop(device): assert_equal(transformed_batch, s_transformed_batch) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_ten_crop(device): script_ten_crop = torch.jit.script(F.ten_crop) @@ -1364,7 +1243,7 @@ def test_elastic_transform_asserts(): _ = F.elastic_transform(img_tensor, displacement=torch.rand(1, 2)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize( diff --git a/test/test_hub.py b/test/test_hub.py deleted file mode 100644 index d88c6fa2cd2..00000000000 --- a/test/test_hub.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import shutil -import sys -import tempfile - -import pytest -import torch.hub as hub - - -def sum_of_model_parameters(model): - s = 0 - for p in model.parameters(): - s += p.sum() - return s - - -SUM_OF_PRETRAINED_RESNET18_PARAMS = -12703.9931640625 - - -@pytest.mark.skipif("torchvision" in sys.modules, reason="TestHub must start without torchvision imported") -class TestHub: - # Only run this check ONCE before all tests start. - # - If torchvision is imported before all tests start, e.g. we might find _C.so - # which doesn't exist in downloaded zip but in the installed wheel. - # - After the first test is run, torchvision is already in sys.modules due to - # Python cache as we run all hub tests in the same python process. 
- - def test_load_from_github(self): - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - - def test_set_dir(self): - temp_dir = tempfile.gettempdir() - hub.set_dir(temp_dir) - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - assert os.path.exists(temp_dir + "/pytorch_vision_master") - shutil.rmtree(temp_dir + "/pytorch_vision_master") - - def test_list_entrypoints(self): - entry_lists = hub.list("pytorch/vision", force_reload=True) - assert "resnet18" in entry_lists - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_image.py b/test/test_image.py index 7fcd54c9c8f..b11dd67ca12 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -1,20 +1,27 @@ +import concurrent.futures +import contextlib import glob import io import os +import re import sys from pathlib import Path import numpy as np import pytest +import requests import torch -import torchvision.transforms.functional as F -from common_utils import assert_equal, needs_cuda -from PIL import __version__ as PILLOW_VERSION, Image +import torchvision.transforms.v2.functional as F +from common_utils import assert_equal, cpu_and_cuda, IN_OSS_CI, needs_cuda +from PIL import __version__ as PILLOW_VERSION, Image, ImageOps, ImageSequence from torchvision.io.image import ( - _read_png_16, + decode_avif, + decode_gif, + decode_heic, decode_image, decode_jpeg, decode_png, + decode_webp, encode_jpeg, encode_png, ImageReadMode, @@ -32,8 +39,14 @@ DAMAGED_PNG = os.path.join(IMAGE_ROOT, "damaged_png") ENCODE_JPEG = os.path.join(IMAGE_ROOT, "encode_jpeg") INTERLACED_PNG = os.path.join(IMAGE_ROOT, "interlaced_png") +TOOSMALL_PNG = os.path.join(IMAGE_ROOT, "toosmall_png") IS_WINDOWS = sys.platform in ("win32", "cygwin") +IS_MACOS = sys.platform == "darwin" +IS_LINUX = sys.platform == "linux" PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split(".")) +WEBP_TEST_IMAGES_DIR = os.environ.get("WEBP_TEST_IMAGES_DIR", "") +# See https://github.com/pytorch/vision/pull/8724#issuecomment-2503964558 +HEIC_AVIF_MESSAGE = "AVIF and HEIF only available on linux." def _get_safe_image_name(name): @@ -77,23 +90,24 @@ def normalize_dimensions(img_pil): ("RGB", ImageReadMode.RGB), ], ) -def test_decode_jpeg(img_path, pil_mode, mode): +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize("decode_fun", (decode_jpeg, decode_image)) +def test_decode_jpeg(img_path, pil_mode, mode, scripted, decode_fun): with Image.open(img_path) as img: is_cmyk = img.mode == "CMYK" if pil_mode is not None: - if is_cmyk: - # libjpeg does not support the conversion - pytest.xfail("Decoding a CMYK jpeg isn't supported") img = img.convert(pil_mode) img_pil = torch.from_numpy(np.array(img)) - if is_cmyk: + if is_cmyk and mode == ImageReadMode.UNCHANGED: # flip the colors to match libjpeg img_pil = 255 - img_pil img_pil = normalize_dimensions(img_pil) data = read_file(img_path) - img_ljpeg = decode_image(data, mode=mode) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img_ljpeg = decode_fun(data, mode=mode) # Permit a small variation on pixel values to account for implementation # differences between Pillow and LibJPEG. 
@@ -101,15 +115,43 @@ def test_decode_jpeg(img_path, pil_mode, mode): assert abs_mean_diff < 2 -def test_decode_jpeg_errors(): - with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): - decode_jpeg(torch.empty((100, 1), dtype=torch.uint8)) +@pytest.mark.parametrize("codec", ["png", "jpeg"]) +@pytest.mark.parametrize("orientation", [1, 2, 3, 4, 5, 6, 7, 8, 0]) +def test_decode_with_exif_orientation(tmpdir, codec, orientation): + fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.{codec}") + t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8) + im = F.to_pil_image(t) + exif = im.getexif() + exif[0x0112] = orientation # set exif orientation + im.save(fp, codec.upper(), exif=exif.tobytes()) - with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): - decode_jpeg(torch.empty((100,), dtype=torch.float16)) + data = read_file(fp) + output = decode_image(data, apply_exif_orientation=True) + + pimg = Image.open(fp) + pimg = ImageOps.exif_transpose(pimg) + + expected = F.pil_to_tensor(pimg) + torch.testing.assert_close(expected, output) + + +@pytest.mark.parametrize("size", [65533, 1, 7, 10, 23, 33]) +def test_invalid_exif(tmpdir, size): + # Inspired from a PIL test: + # https://github.com/python-pillow/Pillow/blob/8f63748e50378424628155994efd7e0739a4d1d1/Tests/test_file_jpeg.py#L299 + fp = os.path.join(tmpdir, "invalid_exif.jpg") + t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8) + im = F.to_pil_image(t) + im.save(fp, "JPEG", exif=b"1" * size) + + data = read_file(fp) + output = decode_image(data, apply_exif_orientation=True) + + pimg = Image.open(fp) + pimg = ImageOps.exif_transpose(pimg) - with pytest.raises(RuntimeError, match="Not a JPEG file"): - decode_jpeg(torch.empty((100), dtype=torch.uint8)) + expected = F.pil_to_tensor(pimg) + torch.testing.assert_close(expected, output) def test_decode_bad_huffman_images(): @@ -150,7 +192,12 @@ def test_damaged_corrupt_images(img_path): ("RGBA", ImageReadMode.RGB_ALPHA), ], ) -def test_decode_png(img_path, pil_mode, mode): +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize("decode_fun", (decode_png, decode_image)) +def test_decode_png(img_path, pil_mode, mode, scripted, decode_fun): + + if scripted: + decode_fun = torch.jit.script(decode_fun) with Image.open(img_path) as img: if pil_mode is not None: @@ -160,19 +207,14 @@ def test_decode_png(img_path, pil_mode, mode): img_pil = normalize_dimensions(img_pil) if img_path.endswith("16.png"): - # 16 bits image decoding is supported, but only as a private API - # FIXME: see https://github.com/pytorch/vision/issues/4731 for potential solutions to making it public - with pytest.raises(RuntimeError, match="At most 8-bit PNG images are supported"): - data = read_file(img_path) - img_lpng = decode_image(data, mode=mode) - - img_lpng = _read_png_16(img_path, mode=mode) - assert img_lpng.dtype == torch.int32 - # PIL converts 16 bits pngs in uint8 - img_lpng = torch.round(img_lpng / (2**16 - 1) * 255).to(torch.uint8) + data = read_file(img_path) + img_lpng = decode_fun(data, mode=mode) + assert img_lpng.dtype == torch.uint16 + # PIL converts 16 bits pngs to uint8 + img_lpng = F.to_dtype(img_lpng, torch.uint8, scale=True) else: data = read_file(img_path) - img_lpng = decode_image(data, mode=mode) + img_lpng = decode_fun(data, mode=mode) tol = 0 if pil_mode is None else 1 @@ -187,23 +229,23 @@ def test_decode_png(img_path, pil_mode, mode): def test_decode_png_errors(): - with pytest.raises(RuntimeError, 
match="Expected a non empty 1-dimensional tensor"): - decode_png(torch.empty((), dtype=torch.uint8)) - with pytest.raises(RuntimeError, match="Content is not png"): - decode_png(torch.randint(3, 5, (300,), dtype=torch.uint8)) with pytest.raises(RuntimeError, match="Out of bound read in decode_png"): decode_png(read_file(os.path.join(DAMAGED_PNG, "sigsegv.png"))) + with pytest.raises(RuntimeError, match="Content is too small for png"): + decode_png(read_file(os.path.join(TOOSMALL_PNG, "heapbof.png"))) @pytest.mark.parametrize( "img_path", [pytest.param(png_path, id=_get_safe_image_name(png_path)) for png_path in get_images(IMAGE_DIR, ".png")], ) -def test_encode_png(img_path): +@pytest.mark.parametrize("scripted", (True, False)) +def test_encode_png(img_path, scripted): pil_image = Image.open(img_path) img_pil = torch.from_numpy(np.array(pil_image)) img_pil = img_pil.permute(2, 0, 1) - png_buf = encode_png(img_pil, compression_level=6) + encode = torch.jit.script(encode_png) if scripted else encode_png + png_buf = encode(img_pil, compression_level=6) rec_img = Image.open(io.BytesIO(bytes(png_buf.tolist()))) rec_img = torch.from_numpy(np.array(rec_img)) @@ -230,27 +272,39 @@ def test_encode_png_errors(): "img_path", [pytest.param(png_path, id=_get_safe_image_name(png_path)) for png_path in get_images(IMAGE_DIR, ".png")], ) -def test_write_png(img_path, tmpdir): +@pytest.mark.parametrize("scripted", (True, False)) +def test_write_png(img_path, tmpdir, scripted): pil_image = Image.open(img_path) img_pil = torch.from_numpy(np.array(pil_image)) img_pil = img_pil.permute(2, 0, 1) filename, _ = os.path.splitext(os.path.basename(img_path)) torch_png = os.path.join(tmpdir, f"{filename}_torch.png") - write_png(img_pil, torch_png, compression_level=6) + write = torch.jit.script(write_png) if scripted else write_png + write(img_pil, torch_png, compression_level=6) saved_image = torch.from_numpy(np.array(Image.open(torch_png))) saved_image = saved_image.permute(2, 0, 1) assert_equal(img_pil, saved_image) -def test_read_file(tmpdir): +def test_read_image(): + # Just testing torchcsript, the functionality is somewhat tested already in other tests. 
+ path = next(get_images(IMAGE_ROOT, ".jpg")) + out = read_image(path) + out_scripted = torch.jit.script(read_image)(path) + torch.testing.assert_close(out, out_scripted, atol=0, rtol=0) + + +@pytest.mark.parametrize("scripted", (True, False)) +def test_read_file(tmpdir, scripted): fname, content = "test1.bin", b"TorchVision\211\n" fpath = os.path.join(tmpdir, fname) with open(fpath, "wb") as f: f.write(content) - data = read_file(fpath) + fun = torch.jit.script(read_file) if scripted else read_file + data = fun(fpath) expected = torch.tensor(list(content), dtype=torch.uint8) os.unlink(fpath) assert_equal(data, expected) @@ -271,11 +325,13 @@ def test_read_file_non_ascii(tmpdir): assert_equal(data, expected) -def test_write_file(tmpdir): +@pytest.mark.parametrize("scripted", (True, False)) +def test_write_file(tmpdir, scripted): fname, content = "test1.bin", b"TorchVision\211\n" fpath = os.path.join(tmpdir, fname) content_tensor = torch.tensor(list(content), dtype=torch.uint8) - write_file(fpath, content_tensor) + write = torch.jit.script(write_file) if scripted else write_file + write(fpath, content_tensor) with open(fpath, "rb") as f: saved_content = f.read() @@ -343,39 +399,64 @@ def test_read_1_bit_png_consistency(shape, mode, tmpdir): def test_read_interlaced_png(): imgs = list(get_images(INTERLACED_PNG, ".png")) with Image.open(imgs[0]) as im1, Image.open(imgs[1]) as im2: - assert not (im1.info.get("interlace") is im2.info.get("interlace")) + assert im1.info.get("interlace") is not im2.info.get("interlace") img1 = read_image(imgs[0]) img2 = read_image(imgs[1]) assert_equal(img1, img2) @needs_cuda -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")], -) @pytest.mark.parametrize("mode", [ImageReadMode.UNCHANGED, ImageReadMode.GRAY, ImageReadMode.RGB]) @pytest.mark.parametrize("scripted", (False, True)) -def test_decode_jpeg_cuda(mode, img_path, scripted): - if "cmyk" in img_path: - pytest.xfail("Decoding a CMYK jpeg isn't supported") +def test_decode_jpegs_cuda(mode, scripted): + encoded_images = [] + for jpeg_path in get_images(IMAGE_ROOT, ".jpg"): + if "cmyk" in jpeg_path: + continue + encoded_image = read_file(jpeg_path) + encoded_images.append(encoded_image) + decoded_images_cpu = decode_jpeg(encoded_images, mode=mode) + decode_fn = torch.jit.script(decode_jpeg) if scripted else decode_jpeg + + # test multithreaded decoding + # in the current version we prevent this by using a lock but we still want to test it + num_workers = 10 + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(decode_fn, encoded_images, mode, "cuda") for _ in range(num_workers)] + decoded_images_threaded = [future.result() for future in futures] + assert len(decoded_images_threaded) == num_workers + for decoded_images in decoded_images_threaded: + assert len(decoded_images) == len(encoded_images) + for decoded_image_cuda, decoded_image_cpu in zip(decoded_images, decoded_images_cpu): + assert decoded_image_cuda.shape == decoded_image_cpu.shape + assert decoded_image_cuda.dtype == decoded_image_cpu.dtype == torch.uint8 + assert (decoded_image_cuda.cpu().float() - decoded_image_cpu.cpu().float()).abs().mean() < 2 - data = read_file(img_path) - img = decode_image(data, mode=mode) - f = torch.jit.script(decode_jpeg) if scripted else decode_jpeg - img_nvjpeg = f(data, mode=mode, device="cuda") - # Some difference expected between jpeg implementations - assert 
(img.float() - img_nvjpeg.cpu().float()).abs().mean() < 2 +@needs_cuda +def test_decode_image_cuda_raises(): + data = torch.randint(0, 127, size=(255,), device="cuda", dtype=torch.uint8) + with pytest.raises(RuntimeError): + decode_image(data) @needs_cuda -@pytest.mark.parametrize("cuda_device", ("cuda", "cuda:0", torch.device("cuda"))) -def test_decode_jpeg_cuda_device_param(cuda_device): - """Make sure we can pass a string or a torch.device as device param""" +def test_decode_jpeg_cuda_device_param(): path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path) data = read_file(path) - decode_jpeg(data, device=cuda_device) + current_device = torch.cuda.current_device() + current_stream = torch.cuda.current_stream() + num_devices = torch.cuda.device_count() + devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)] + results = [] + for device in devices: + results.append(decode_jpeg(data, device=device)) + assert len(results) == len(devices) + for result in results: + assert torch.all(result.cpu() == results[0].cpu()) + assert current_device == torch.cuda.current_device() + assert current_stream == torch.cuda.current_stream() @needs_cuda @@ -383,12 +464,73 @@ def test_decode_jpeg_cuda_errors(): data = read_file(next(get_images(IMAGE_ROOT, ".jpg"))) with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): decode_jpeg(data.reshape(-1, 1), device="cuda") - with pytest.raises(RuntimeError, match="input tensor must be on CPU"): + with pytest.raises(ValueError, match="must be tensors"): + decode_jpeg([1, 2, 3]) + with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"): decode_jpeg(data.to("cuda"), device="cuda") with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): decode_jpeg(data.to(torch.float), device="cuda") - with pytest.raises(RuntimeError, match="Expected a cuda device"): - torch.ops.image.decode_jpeg_cuda(data, ImageReadMode.UNCHANGED.value, "cpu") + with pytest.raises(RuntimeError, match="Expected the device parameter to be a cuda device"): + torch.ops.image.decode_jpegs_cuda([data], ImageReadMode.UNCHANGED.value, "cpu") + with pytest.raises(ValueError, match="Input tensor must be a CPU tensor"): + decode_jpeg( + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ) + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cuda"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cuda"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ], + device="cuda", + ) + + with pytest.raises(ValueError, match="Input list must contain tensors on CPU"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8, device="cpu"), + torch.empty((100,), dtype=torch.uint8, device="cuda"), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Expected a torch.uint8 tensor"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((100,), dtype=torch.float32), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Expected a non empty 1-dimensional tensor"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((1, 100), dtype=torch.uint8), + ], + device="cuda", + ) + + with pytest.raises(RuntimeError, match="Error while decoding JPEG 
images"): + decode_jpeg( + [ + torch.empty((100,), dtype=torch.uint8), + torch.empty((100,), dtype=torch.uint8), + ], + device="cuda", + ) + + with pytest.raises(ValueError, match="Input list must contain at least one element"): + decode_jpeg([], device="cuda") def test_encode_jpeg_errors(): @@ -412,102 +554,262 @@ def test_encode_jpeg_errors(): encode_jpeg(torch.empty((100, 100), dtype=torch.uint8)) -def _collect_if(cond): - # TODO: remove this once test_encode_jpeg_reference and test_write_jpeg_reference - # are removed - def _inner(test_func): - if cond: - return test_func - else: - return pytest.mark.dont_collect(test_func) - - return _inner - - -@_collect_if(cond=False) +@pytest.mark.skipif(IS_MACOS, reason="https://github.com/pytorch/vision/issues/8031") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], ) -def test_encode_jpeg_reference(img_path): - # This test is *wrong*. - # It compares a torchvision-encoded jpeg with a PIL-encoded jpeg (the reference), but it - # starts encoding the torchvision version from an image that comes from - # decode_jpeg, which can yield different results from pil.decode (see - # test_decode... which uses a high tolerance). - # Instead, we should start encoding from the exact same decoded image, for a - # valid comparison. This is done in test_encode_jpeg, but unfortunately - # these more correct tests fail on windows (probably because of a difference - # in libjpeg) between torchvision and PIL. - # FIXME: make the correct tests pass on windows and remove this. - dirname = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - write_folder = os.path.join(dirname, "jpeg_write") - expected_file = os.path.join(write_folder, f"{filename}_pil.jpg") - img = decode_jpeg(read_file(img_path)) +@pytest.mark.parametrize("scripted", (True, False)) +def test_encode_jpeg(img_path, scripted): + img = read_image(img_path) - with open(expected_file, "rb") as f: - pil_bytes = f.read() - pil_bytes = torch.as_tensor(list(pil_bytes), dtype=torch.uint8) + pil_img = F.to_pil_image(img) + buf = io.BytesIO() + pil_img.save(buf, format="JPEG", quality=75) + + encoded_jpeg_pil = torch.frombuffer(buf.getvalue(), dtype=torch.uint8) + + encode = torch.jit.script(encode_jpeg) if scripted else encode_jpeg for src_img in [img, img.contiguous()]: - # PIL sets jpeg quality to 75 by default - jpeg_bytes = encode_jpeg(src_img, quality=75) - assert_equal(jpeg_bytes, pil_bytes) + encoded_jpeg_torch = encode(src_img, quality=75) + assert_equal(encoded_jpeg_torch, encoded_jpeg_pil) + + +@needs_cuda +def test_encode_jpeg_cuda_device_param(): + path = next(path for path in get_images(IMAGE_ROOT, ".jpg") if "cmyk" not in path) + + data = read_image(path) + current_device = torch.cuda.current_device() + current_stream = torch.cuda.current_stream() + num_devices = torch.cuda.device_count() + devices = ["cuda", torch.device("cuda")] + [torch.device(f"cuda:{i}") for i in range(num_devices)] + results = [] + for device in devices: + results.append(encode_jpeg(data.to(device=device))) + assert len(results) == len(devices) + for result in results: + assert torch.all(result.cpu() == results[0].cpu()) + assert current_device == torch.cuda.current_device() + assert current_stream == torch.cuda.current_stream() -@_collect_if(cond=False) + +@needs_cuda @pytest.mark.parametrize( "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in 
get_images(ENCODE_JPEG, ".jpg")], + [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(IMAGE_ROOT, ".jpg")], ) -def test_write_jpeg_reference(img_path, tmpdir): - # FIXME: Remove this eventually, see test_encode_jpeg_reference - data = read_file(img_path) - img = decode_jpeg(data) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize("contiguous", (False, True)) +def test_encode_jpeg_cuda(img_path, scripted, contiguous): + decoded_image_tv = read_image(img_path) + encode_fn = torch.jit.script(encode_jpeg) if scripted else encode_jpeg - basedir = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - torch_jpeg = os.path.join(tmpdir, f"{filename}_torch.jpg") - pil_jpeg = os.path.join(basedir, "jpeg_write", f"{filename}_pil.jpg") + if "cmyk" in img_path: + pytest.xfail("Encoding a CMYK jpeg isn't supported") + if decoded_image_tv.shape[0] == 1: + pytest.xfail("Decoding a grayscale jpeg isn't supported") + # For more detail as to why check out: https://github.com/NVIDIA/cuda-samples/issues/23#issuecomment-559283013 + if contiguous: + decoded_image_tv = decoded_image_tv[None].contiguous(memory_format=torch.contiguous_format)[0] + else: + decoded_image_tv = decoded_image_tv[None].contiguous(memory_format=torch.channels_last)[0] + encoded_jpeg_cuda_tv = encode_fn(decoded_image_tv.cuda(), quality=75) + decoded_jpeg_cuda_tv = decode_jpeg(encoded_jpeg_cuda_tv.cpu()) - write_jpeg(img, torch_jpeg, quality=75) + # the actual encoded bytestreams from libnvjpeg and libjpeg-turbo differ for the same quality + # instead, we re-decode the encoded image and compare to the original + abs_mean_diff = (decoded_jpeg_cuda_tv.float() - decoded_image_tv.float()).abs().mean().item() + assert abs_mean_diff < 3 - with open(torch_jpeg, "rb") as f: - torch_bytes = f.read() - with open(pil_jpeg, "rb") as f: - pil_bytes = f.read() +@needs_cuda +def test_encode_jpeg_cuda_sync(): + """ + Non-regression test for https://github.com/pytorch/vision/issues/8587. + Attempts to reproduce an intermittent CUDA stream synchronization bug + by randomly creating images and round-tripping them via encode_jpeg + and decode_jpeg on the GPU. Fails if the mean difference in uint8 range + exceeds 5. 
+ """ + torch.manual_seed(42) + + # manual testing shows this bug appearing often in iterations between 50 and 100 + # as a synchronization bug, this can't be reliably reproduced + max_iterations = 100 + threshold = 5.0 # in [0..255] + + device = torch.device("cuda") + + for iteration in range(max_iterations): + height, width = torch.randint(4000, 5000, size=(2,)) + + image = torch.linspace(0, 1, steps=height * width, device=device) + image = image.view(1, height, width).expand(3, -1, -1) + + image = (image * 255).clamp(0, 255).to(torch.uint8) + jpeg_bytes = encode_jpeg(image, quality=100) + + decoded_image = decode_jpeg(jpeg_bytes.cpu(), device=device) + mean_difference = (image.float() - decoded_image.float()).abs().mean().item() + + assert mean_difference <= threshold, ( + f"Encode/decode mismatch at iteration={iteration}, " + f"size={height}x{width}, mean diff={mean_difference:.2f}" + ) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("scripted", (True, False)) +@pytest.mark.parametrize("contiguous", (True, False)) +def test_encode_jpegs_batch(scripted, contiguous, device): + if device == "cpu" and IS_MACOS: + pytest.skip("https://github.com/pytorch/vision/issues/8031") + decoded_images_tv = [] + for jpeg_path in get_images(IMAGE_ROOT, ".jpg"): + if "cmyk" in jpeg_path: + continue + decoded_image = read_image(jpeg_path) + if decoded_image.shape[0] == 1: + continue + if contiguous: + decoded_image = decoded_image[None].contiguous(memory_format=torch.contiguous_format)[0] + else: + decoded_image = decoded_image[None].contiguous(memory_format=torch.channels_last)[0] + decoded_images_tv.append(decoded_image) + + encode_fn = torch.jit.script(encode_jpeg) if scripted else encode_jpeg + + decoded_images_tv_device = [img.to(device=device) for img in decoded_images_tv] + encoded_jpegs_tv_device = encode_fn(decoded_images_tv_device, quality=75) + encoded_jpegs_tv_device = [decode_jpeg(img.cpu()) for img in encoded_jpegs_tv_device] + + for original, encoded_decoded in zip(decoded_images_tv, encoded_jpegs_tv_device): + c, h, w = original.shape + abs_mean_diff = (original.float() - encoded_decoded.float()).abs().mean().item() + assert abs_mean_diff < 3 + + # test multithreaded decoding + # in the current version we prevent this by using a lock but we still want to test it + num_workers = 10 + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + futures = [executor.submit(encode_fn, decoded_images_tv_device) for _ in range(num_workers)] + encoded_images_threaded = [future.result() for future in futures] + assert len(encoded_images_threaded) == num_workers + for encoded_images in encoded_images_threaded: + assert len(decoded_images_tv_device) == len(encoded_images) + for i, (encoded_image_cuda, decoded_image_tv) in enumerate(zip(encoded_images, decoded_images_tv_device)): + # make sure all the threads produce identical outputs + assert torch.all(encoded_image_cuda == encoded_images_threaded[0][i]) + + # make sure the outputs are identical or close enough to baseline + decoded_cuda_encoded_image = decode_jpeg(encoded_image_cuda.cpu()) + assert decoded_cuda_encoded_image.shape == decoded_image_tv.shape + assert decoded_cuda_encoded_image.dtype == decoded_image_tv.dtype + assert (decoded_cuda_encoded_image.cpu().float() - decoded_image_tv.cpu().float()).abs().mean() < 3 - assert_equal(torch_bytes, pil_bytes) +@needs_cuda +def test_single_encode_jpeg_cuda_errors(): + with pytest.raises(RuntimeError, match="Input tensor dtype should be 
uint8"): + encode_jpeg(torch.empty((3, 100, 100), dtype=torch.float32, device="cuda")) -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. -@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], -) -def test_encode_jpeg(img_path): - img = read_image(img_path) + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 5"): + encode_jpeg(torch.empty((5, 100, 100), dtype=torch.uint8, device="cuda")) - pil_img = F.to_pil_image(img) - buf = io.BytesIO() - pil_img.save(buf, format="JPEG", quality=75) + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 1"): + encode_jpeg(torch.empty((1, 100, 100), dtype=torch.uint8, device="cuda")) - encoded_jpeg_pil = torch.frombuffer(buf.getvalue(), dtype=torch.uint8) + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg(torch.empty((1, 3, 100, 100), dtype=torch.uint8, device="cuda")) - for src_img in [img, img.contiguous()]: - encoded_jpeg_torch = encode_jpeg(src_img, quality=75) - assert_equal(encoded_jpeg_torch, encoded_jpeg_pil) + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg(torch.empty((100, 100), dtype=torch.uint8, device="cuda")) + + +@needs_cuda +def test_batch_encode_jpegs_cuda_errors(): + with pytest.raises(RuntimeError, match="Input tensor dtype should be uint8"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((3, 100, 100), dtype=torch.float32, device="cuda"), + ] + ) + + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 5"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((5, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(RuntimeError, match="The number of channels should be 3, got: 1"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((1, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((1, 3, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. 
-@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") + with pytest.raises(RuntimeError, match="Input data should be a 3-dimensional tensor"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises(RuntimeError, match="Input tensor should be on CPU"): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cpu"), + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + ] + ) + + with pytest.raises( + RuntimeError, match="All input tensors must be on the same CUDA device when encoding with nvjpeg" + ): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda"), + torch.empty((3, 100, 100), dtype=torch.uint8, device="cpu"), + ] + ) + + if torch.cuda.device_count() >= 2: + with pytest.raises( + RuntimeError, match="All input tensors must be on the same CUDA device when encoding with nvjpeg" + ): + encode_jpeg( + [ + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda:0"), + torch.empty((3, 100, 100), dtype=torch.uint8, device="cuda:1"), + ] + ) + + with pytest.raises(ValueError, match="encode_jpeg requires at least one input tensor when a list is passed"): + encode_jpeg([]) + + +@pytest.mark.skipif(IS_MACOS, reason="https://github.com/pytorch/vision/issues/8031") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], ) -def test_write_jpeg(img_path, tmpdir): +@pytest.mark.parametrize("scripted", (True, False)) +def test_write_jpeg(img_path, tmpdir, scripted): tmpdir = Path(tmpdir) img = read_image(img_path) pil_img = F.to_pil_image(img) @@ -515,7 +817,8 @@ def test_write_jpeg(img_path, tmpdir): torch_jpeg = str(tmpdir / "torch.jpg") pil_jpeg = str(tmpdir / "pil.jpg") - write_jpeg(img, torch_jpeg, quality=75) + write = torch.jit.script(write_jpeg) if scripted else write_jpeg + write(img, torch_jpeg, quality=75) pil_img.save(pil_jpeg, quality=75) with open(torch_jpeg, "rb") as f: @@ -527,5 +830,301 @@ def test_write_jpeg(img_path, tmpdir): assert_equal(torch_bytes, pil_bytes) +def test_pathlib_support(tmpdir): + # Just make sure pathlib.Path is supported where relevant + + jpeg_path = Path(next(get_images(ENCODE_JPEG, ".jpg"))) + + read_file(jpeg_path) + read_image(jpeg_path) + + write_path = Path(tmpdir) / "whatever" + img = torch.randint(0, 10, size=(3, 4, 4), dtype=torch.uint8) + + write_file(write_path, data=img.flatten()) + write_jpeg(img, write_path) + write_png(img, write_path) + + +@pytest.mark.parametrize( + "name", ("gifgrid", "fire", "porsche", "treescap", "treescap-interlaced", "solid2", "x-trans", "earth") +) +@pytest.mark.parametrize("scripted", (True, False)) +def test_decode_gif(tmpdir, name, scripted): + # Using test images from GIFLIB + # https://sourceforge.net/p/giflib/code/ci/master/tree/pic/, we assert PIL + # and torchvision decoded outputs are equal. + # We're not testing against "welcome2" because PIL and GIFLIB disagee on what + # the background color should be (likely a difference in the way they handle + # transparency?) + # 'earth' image is from wikipedia, licensed under CC BY-SA 3.0 + # https://creativecommons.org/licenses/by-sa/3.0/ + # it allows to properly test for transparency, TOP-LEFT offsets, and + # disposal modes. + + path = tmpdir / f"{name}.gif" + if name == "earth": + if IN_OSS_CI: + # TODO: Fix this... one day. 
+ pytest.skip("Skipping 'earth' test as it's flaky on OSS CI") + url = "https://upload.wikimedia.org/wikipedia/commons/2/2c/Rotating_earth_%28large%29.gif" + else: + url = f"https://sourceforge.net/p/giflib/code/ci/master/tree/pic/{name}.gif?format=raw" + with open(path, "wb") as f: + f.write(requests.get(url).content) + + encoded_bytes = read_file(path) + f = torch.jit.script(decode_gif) if scripted else decode_gif + tv_out = f(encoded_bytes) + if tv_out.ndim == 3: + tv_out = tv_out[None] + + assert tv_out.is_contiguous(memory_format=torch.channels_last) + + # For some reason, not using Image.open() as a CM causes "ResourceWarning: unclosed file" + with Image.open(path) as pil_img: + pil_seq = ImageSequence.Iterator(pil_img) + + for pil_frame, tv_frame in zip(pil_seq, tv_out): + pil_frame = F.pil_to_tensor(pil_frame.convert("RGB")) + torch.testing.assert_close(tv_frame, pil_frame, atol=0, rtol=0) + + +@pytest.mark.parametrize( + "decode_fun, match", + [ + (decode_png, "Content is not png"), + (decode_jpeg, "Not a JPEG file"), + (decode_gif, re.escape("DGifOpenFileName() failed - 103")), + (decode_webp, "WebPGetFeatures failed."), + pytest.param( + decode_avif, + "BMFF parsing failed", + # marks=pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) + marks=pytest.mark.skipif(True, reason="Skipping avif/heic tests for now."), + ), + pytest.param( + decode_heic, + "Invalid input: No 'ftyp' box", + # marks=pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE), + marks=pytest.mark.skipif(True, reason="Skipping avif/heic tests for now."), + ), + ], +) +def test_decode_bad_encoded_data(decode_fun, match): + encoded_data = torch.randint(0, 256, (100,), dtype=torch.uint8) + with pytest.raises(RuntimeError, match="Input tensor must be 1-dimensional"): + decode_fun(encoded_data[None]) + with pytest.raises(RuntimeError, match="Input tensor must have uint8 data type"): + decode_fun(encoded_data.float()) + with pytest.raises(RuntimeError, match="Input tensor must be contiguous"): + decode_fun(encoded_data[::2]) + with pytest.raises(RuntimeError, match=match): + decode_fun(encoded_data) + + +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +@pytest.mark.parametrize("scripted", (False, True)) +def test_decode_webp(decode_fun, scripted): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".webp"))) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img = decode_fun(encoded_bytes) + assert img.shape == (3, 100, 100) + assert img[None].is_contiguous(memory_format=torch.channels_last) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +def test_decode_webp_grayscale(decode_fun, capfd): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".webp"))) + + # We warn at the C++ layer because for decode_image(), we don't do the image + # type dispatch until we get to the C++ version of decode_image(). We could + # warn at the Python layer in decode_webp(), but then users would get a + # double wanring: one from the Python layer and one from the C++ layer. + # + # Because we use the TORCH_WARN_ONCE macro, we need to do this dance to + # temporarily always warn so we can test. 
+ @contextlib.contextmanager + def set_always_warn(): + torch._C._set_warnAlways(True) + yield + torch._C._set_warnAlways(False) + + with set_always_warn(): + img = decode_fun(encoded_bytes, mode=ImageReadMode.GRAY) + assert "Webp does not support grayscale conversions" in capfd.readouterr().err + + # Note that because we do not support grayscale conversions, we expect + # that the number of color channels is still 3. + assert img.shape == (3, 100, 100) + + +# This test is skipped by default because it requires webp images that we're not +# including within the repo. The test images were downloaded manually from the +# different pages of https://developers.google.com/speed/webp/gallery +@pytest.mark.skipif(not WEBP_TEST_IMAGES_DIR, reason="WEBP_TEST_IMAGES_DIR is not set") +@pytest.mark.parametrize("decode_fun", (decode_webp, decode_image)) +@pytest.mark.parametrize("scripted", (False, True)) +@pytest.mark.parametrize( + "mode, pil_mode", + ( + # Note that converting an RGBA image to RGB leads to bad results because the + # transparent pixels aren't necessarily set to "black" or "white", they can be + # random stuff. This is consistent with PIL results. + (ImageReadMode.RGB, "RGB"), + (ImageReadMode.RGB_ALPHA, "RGBA"), + (ImageReadMode.UNCHANGED, None), + ), +) +@pytest.mark.parametrize("filename", Path(WEBP_TEST_IMAGES_DIR).glob("*.webp"), ids=lambda p: p.name) +def test_decode_webp_against_pil(decode_fun, scripted, mode, pil_mode, filename): + encoded_bytes = read_file(filename) + if scripted: + decode_fun = torch.jit.script(decode_fun) + img = decode_fun(encoded_bytes, mode=mode) + assert img[None].is_contiguous(memory_format=torch.channels_last) + + pil_img = Image.open(filename).convert(pil_mode) + from_pil = F.pil_to_tensor(pil_img) + assert_equal(img, from_pil) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +# @pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) +@pytest.mark.skipif(True, reason="Skipping avif/heic tests for now.") +@pytest.mark.parametrize("decode_fun", (decode_avif,)) +def test_decode_avif(decode_fun): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".avif"))) + img = decode_fun(encoded_bytes) + assert img.shape == (3, 100, 100) + assert img[None].is_contiguous(memory_format=torch.channels_last) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +# Note: decode_image fails because some of these files have a (valid) signature +# we don't recognize. We should probably use libmagic.... +# @pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) +@pytest.mark.skipif(True, reason="Skipping avif/heic tests for now.") +@pytest.mark.parametrize("decode_fun", (decode_avif, decode_heic)) +@pytest.mark.parametrize( + "mode, pil_mode", + ( + (ImageReadMode.RGB, "RGB"), + (ImageReadMode.RGB_ALPHA, "RGBA"), + (ImageReadMode.UNCHANGED, None), + ), +) +@pytest.mark.parametrize( + "filename", Path("/home/nicolashug/dev/libavif/tests/data/").glob("*.avif"), ids=lambda p: p.name +) +def test_decode_avif_heic_against_pil(decode_fun, mode, pil_mode, filename): + if "reversed_dimg_order" in str(filename): + # Pillow properly decodes this one, but we don't (order of parts of the + # image is wrong). This is due to a bug that was recently fixed in + # libavif. 
Hopefully this test will end up passing soon with a new + # libavif version https://github.com/AOMediaCodec/libavif/issues/2311 + pytest.xfail() + import pillow_avif # noqa + + encoded_bytes = read_file(filename) + try: + img = decode_fun(encoded_bytes, mode=mode) + except RuntimeError as e: + if any( + s in str(e) + for s in ( + "BMFF parsing failed", + "avifDecoderParse failed: ", + "file contains more than one image", + "no 'ispe' property", + "'iref' has double references", + "Invalid image grid", + "decode_heif failed: Invalid input: No 'meta' box", + ) + ): + pytest.skip(reason="Expected failure, that's OK") + else: + raise e + assert img[None].is_contiguous(memory_format=torch.channels_last) + if mode == ImageReadMode.RGB: + assert img.shape[0] == 3 + if mode == ImageReadMode.RGB_ALPHA: + assert img.shape[0] == 4 + + if img.dtype == torch.uint16: + img = F.to_dtype(img, dtype=torch.uint8, scale=True) + try: + from_pil = F.pil_to_tensor(Image.open(filename).convert(pil_mode)) + except RuntimeError as e: + if any(s in str(e) for s in ("Invalid image grid", "Failed to decode image: Not implemented")): + pytest.skip(reason="PIL failure") + else: + raise e + + if True: + from torchvision.utils import make_grid + + g = make_grid([img, from_pil]) + F.to_pil_image(g).save(f"/home/nicolashug/out_images/{filename.name}.{pil_mode}.png") + + is_decode_heic = getattr(decode_fun, "__name__", getattr(decode_fun, "name", None)) == "decode_heic" + if mode == ImageReadMode.RGB and not is_decode_heic: + # We don't compare torchvision's AVIF against PIL for RGB because + # results look pretty different on RGBA images (other images are fine). + # The result on torchvision basically just plainly ignores the alpha + # channel, resulting in transparent pixels looking dark. PIL seems to be + # using a sort of k-nn thing (Take a look at the resulting images) + return + if filename.name == "sofa_grid1x5_420.avif" and is_decode_heic: + return + + torch.testing.assert_close(img, from_pil, rtol=0, atol=3) + + +# @pytest.mark.skipif(not IS_LINUX, reason=HEIC_AVIF_MESSAGE) +@pytest.mark.skipif(True, reason="Skipping avif/heic tests for now.") +@pytest.mark.parametrize("decode_fun", (decode_heic,)) +def test_decode_heic(decode_fun): + encoded_bytes = read_file(next(get_images(FAKEDATA_DIR, ".heic"))) + img = decode_fun(encoded_bytes) + assert img.shape == (3, 100, 100) + assert img[None].is_contiguous(memory_format=torch.channels_last) + img += 123 # make sure image buffer wasn't freed by underlying decoding lib + + +@pytest.mark.parametrize("input_type", ("Path", "str", "tensor")) +@pytest.mark.parametrize("scripted", (False, True)) +def test_decode_image_path(input_type, scripted): + # Check that decode_image can support not just tensors as input + path = next(get_images(IMAGE_ROOT, ".jpg")) + if input_type == "Path": + input = Path(path) + elif input_type == "str": + input = path + elif input_type == "tensor": + input = read_file(path) + else: + raise ValueError("Oops") + + if scripted and input_type == "Path": + pytest.xfail(reason="Can't pass a Path when scripting") + + decode_fun = torch.jit.script(decode_image) if scripted else decode_image + decode_fun(input) + + +def test_mode_str(): + # Make sure decode_image supports string modes. We just test decode_image, + # not all of the decoding functions, but they should all support that too. + # Torchscript fails when passing strings, which is expected.
+ path = next(get_images(IMAGE_ROOT, ".png")) + assert decode_image(path, mode="RGB").shape[0] == 3 + assert decode_image(path, mode="rGb").shape[0] == 3 + assert decode_image(path, mode="GRAY").shape[0] == 1 + assert decode_image(path, mode="RGBA").shape[0] == 4 + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_internet.py b/test/test_internet.py index 03828c151c0..34fc3d4aa08 100644 --- a/test/test_internet.py +++ b/test/test_internet.py @@ -6,6 +6,7 @@ """ import os +import pathlib from urllib.error import URLError import pytest @@ -13,7 +14,10 @@ class TestDatasetUtils: - def test_download_url(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url(self, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "http://github.com/pytorch/vision/archive/master.zip" try: utils.download_url(url, tmpdir) @@ -21,7 +25,10 @@ def test_download_url(self, tmpdir): except URLError: pytest.skip(f"could not download test file '{url}'") - def test_download_url_retry_http(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url_retry_http(self, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "https://github.com/pytorch/vision/archive/master.zip" try: utils.download_url(url, tmpdir) @@ -29,12 +36,18 @@ def test_download_url_retry_http(self, tmpdir): except URLError: pytest.skip(f"could not download test file '{url}'") - def test_download_url_dont_exist(self, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url_dont_exist(self, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "http://github.com/pytorch/vision/archive/this_doesnt_exist.zip" with pytest.raises(URLError): utils.download_url(url, tmpdir) - def test_download_url_dispatch_download_from_google_drive(self, mocker, tmpdir): + @pytest.mark.parametrize("use_pathlib", (True, False)) + def test_download_url_dispatch_download_from_google_drive(self, mocker, tmpdir, use_pathlib): + if use_pathlib: + tmpdir = pathlib.Path(tmpdir) url = "https://drive.google.com/file/d/1GO-BHUYRuvzr1Gtp2_fqXRsr9TIeYbhV/view" id = "1GO-BHUYRuvzr1Gtp2_fqXRsr9TIeYbhV" @@ -44,7 +57,7 @@ def test_download_url_dispatch_download_from_google_drive(self, mocker, tmpdir): mocked = mocker.patch("torchvision.datasets.utils.download_file_from_google_drive") utils.download_url(url, tmpdir, filename, md5) - mocked.assert_called_once_with(id, tmpdir, filename, md5) + mocked.assert_called_once_with(id, os.path.expanduser(tmpdir), filename, md5) if __name__ == "__main__": diff --git a/test/test_io.py b/test/test_io.py index c45180571f0..d2950ac9595 100644 --- a/test/test_io.py +++ b/test/test_io.py @@ -6,7 +6,7 @@ import pytest import torch import torchvision.io as io -from common_utils import assert_equal +from common_utils import assert_equal, cpu_and_cuda from torchvision import get_video_backend @@ -63,7 +63,7 @@ def temp_video(num_frames, height, width, fps, lossless=False, video_codec=None, @pytest.mark.skipif( - get_video_backend() != "pyav" and not io._HAS_VIDEO_OPT, reason="video_reader backend not available" + get_video_backend() != "pyav" and not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend not available" ) @pytest.mark.skipif(av is None, reason="PyAV unavailable") class TestVideo: @@ -77,14 +77,14 @@ def test_write_read_video(self): assert_equal(data, lv) assert info["video_fps"] == 5 - @pytest.mark.skipif(not io._HAS_VIDEO_OPT, 
reason="video_reader backend is not chosen") + @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") def test_probe_video_from_file(self): with temp_video(10, 300, 300, 5) as (f_name, data): video_info = io._probe_video_from_file(f_name) assert pytest.approx(2, rel=0.0, abs=0.1) == video_info.video_duration assert pytest.approx(5, rel=0.0, abs=0.1) == video_info.video_fps - @pytest.mark.skipif(not io._HAS_VIDEO_OPT, reason="video_reader backend is not chosen") + @pytest.mark.skipif(not io._HAS_CPU_VIDEO_DECODER, reason="video_reader backend is not chosen") def test_probe_video_from_memory(self): with temp_video(10, 300, 300, 5) as (f_name, data): with open(f_name, "rb") as fp: @@ -255,18 +255,19 @@ def test_read_video_partially_corrupted_file(self): assert_equal(video, data) @pytest.mark.skipif(sys.platform == "win32", reason="temporarily disabled on Windows") - def test_write_video_with_audio(self, tmpdir): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_write_video_with_audio(self, device, tmpdir): f_name = os.path.join(VIDEO_DIR, "R6llTwEh07w.mp4") video_tensor, audio_tensor, info = io.read_video(f_name, pts_unit="sec") out_f_name = os.path.join(tmpdir, "testing.mp4") io.video.write_video( out_f_name, - video_tensor, + video_tensor.to(device), round(info["video_fps"]), video_codec="libx264rgb", options={"crf": "0"}, - audio_array=audio_tensor, + audio_array=audio_tensor.to(device), audio_fps=info["audio_fps"], audio_codec="aac", ) diff --git a/test/test_models.cpp b/test/test_models.cpp deleted file mode 100644 index 092fc567ac2..00000000000 --- a/test/test_models.cpp +++ /dev/null @@ -1,209 +0,0 @@ -#include -#include -#include - -#include "../torchvision/csrc/models/models.h" - -using namespace vision::models; - -template -torch::Tensor forward_model(const std::string& input_path, torch::Tensor x) { - Model network; - torch::load(network, input_path); - network->eval(); - return network->forward(x); -} - -torch::Tensor forward_alexnet(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_vgg11(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg13(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg16(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg19(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_vgg11bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg13bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg16bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_vgg19bn(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_resnet18(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet34(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet50(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet101( - const std::string& input_path, - 
torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnet152( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnext50_32x4d( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_resnext101_32x8d( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_wide_resnet50_2( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_wide_resnet101_2( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_squeezenet1_0( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_squeezenet1_1( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_densenet121( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_densenet169( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_densenet201( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_densenet161( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_mobilenetv2( - const std::string& input_path, - torch::Tensor x) { - return forward_model(input_path, x); -} - -torch::Tensor forward_googlenet( - const std::string& input_path, - torch::Tensor x) { - GoogLeNet network; - torch::load(network, input_path); - network->eval(); - return network->forward(x).output; -} -torch::Tensor forward_inceptionv3( - const std::string& input_path, - torch::Tensor x) { - InceptionV3 network; - torch::load(network, input_path); - network->eval(); - return network->forward(x).output; -} - -torch::Tensor forward_mnasnet0_5(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_mnasnet0_75(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_mnasnet1_0(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} -torch::Tensor forward_mnasnet1_3(const std::string& input_path, torch::Tensor x) { - return forward_model(input_path, x); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward_alexnet", &forward_alexnet, "forward_alexnet"); - - m.def("forward_vgg11", &forward_vgg11, "forward_vgg11"); - m.def("forward_vgg13", &forward_vgg13, "forward_vgg13"); - m.def("forward_vgg16", &forward_vgg16, "forward_vgg16"); - m.def("forward_vgg19", &forward_vgg19, "forward_vgg19"); - - m.def("forward_vgg11bn", &forward_vgg11bn, "forward_vgg11bn"); - m.def("forward_vgg13bn", &forward_vgg13bn, "forward_vgg13bn"); - m.def("forward_vgg16bn", &forward_vgg16bn, "forward_vgg16bn"); - m.def("forward_vgg19bn", &forward_vgg19bn, "forward_vgg19bn"); - - m.def("forward_resnet18", &forward_resnet18, "forward_resnet18"); - m.def("forward_resnet34", &forward_resnet34, "forward_resnet34"); - m.def("forward_resnet50", &forward_resnet50, "forward_resnet50"); - m.def("forward_resnet101", &forward_resnet101, "forward_resnet101"); - m.def("forward_resnet152", &forward_resnet152, "forward_resnet152"); - 
m.def( - "forward_resnext50_32x4d", - &forward_resnext50_32x4d, - "forward_resnext50_32x4d"); - m.def( - "forward_resnext101_32x8d", - &forward_resnext101_32x8d, - "forward_resnext101_32x8d"); - m.def( - "forward_wide_resnet50_2", - &forward_wide_resnet50_2, - "forward_wide_resnet50_2"); - m.def( - "forward_wide_resnet101_2", - &forward_wide_resnet101_2, - "forward_wide_resnet101_2"); - - m.def( - "forward_squeezenet1_0", &forward_squeezenet1_0, "forward_squeezenet1_0"); - m.def( - "forward_squeezenet1_1", &forward_squeezenet1_1, "forward_squeezenet1_1"); - - m.def("forward_densenet121", &forward_densenet121, "forward_densenet121"); - m.def("forward_densenet169", &forward_densenet169, "forward_densenet169"); - m.def("forward_densenet201", &forward_densenet201, "forward_densenet201"); - m.def("forward_densenet161", &forward_densenet161, "forward_densenet161"); - - m.def("forward_mobilenetv2", &forward_mobilenetv2, "forward_mobilenetv2"); - - m.def("forward_googlenet", &forward_googlenet, "forward_googlenet"); - m.def("forward_inceptionv3", &forward_inceptionv3, "forward_inceptionv3"); - - m.def("forward_mnasnet0_5", &forward_mnasnet0_5, "forward_mnasnet0_5"); - m.def("forward_mnasnet0_75", &forward_mnasnet0_75, "forward_mnasnet0_75"); - m.def("forward_mnasnet1_0", &forward_mnasnet1_0, "forward_mnasnet1_0"); - m.def("forward_mnasnet1_3", &forward_mnasnet1_3, "forward_mnasnet1_3"); -} diff --git a/test/test_models.py b/test/test_models.py index f145727bbd1..202bbdbd0cd 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -3,6 +3,7 @@ import operator import os import pkgutil +import platform import sys import warnings from collections import OrderedDict @@ -14,9 +15,10 @@ import torch.fx import torch.nn as nn from _utils_internal import get_relative_path -from common_utils import cpu_and_gpu, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed -from torchvision import models -from torchvision.models._api import find_model, list_models +from common_utils import cpu_and_cuda, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed +from PIL import Image +from torchvision import models, transforms +from torchvision.models import get_model_builder, list_models ACCEPT = os.getenv("EXPECTTEST_ACCEPT", "0") == "1" @@ -24,7 +26,44 @@ def list_model_fns(module): - return [find_model(name) for name in list_models(module)] + return [get_model_builder(name) for name in list_models(module)] + + +def _get_image(input_shape, real_image, device, dtype=None): + """This routine loads a real or random image based on `real_image` argument. + Currently, the real image is utilized for the following list of models: + - `retinanet_resnet50_fpn`, + - `retinanet_resnet50_fpn_v2`, + - `keypointrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn_v2`, + - `fcos_resnet50_fpn`, + - `maskrcnn_resnet50_fpn`, + - `maskrcnn_resnet50_fpn_v2`, + in `test_classification_model` and `test_detection_model`. 
+ To do so, a keyword argument `real_image` was added to the abovelisted models in `_model_params` + """ + if real_image: + # TODO: Maybe unify file discovery logic with test_image.py + GRACE_HOPPER = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" + ) + + img = Image.open(GRACE_HOPPER) + + original_width, original_height = img.size + + # make the image square + img = img.crop((0, 0, original_width, original_width)) + img = img.resize(input_shape[1:3]) + + convert_tensor = transforms.ToTensor() + image = convert_tensor(img) + assert tuple(image.size()) == input_shape + return image.to(device=device, dtype=dtype) + + # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests + return torch.rand(input_shape).to(device=device, dtype=dtype) @pytest.fixture @@ -110,10 +149,10 @@ def _assert_expected(output, name, prec=None, atol=None, rtol=None): if binary_size > MAX_PICKLE_SIZE: raise RuntimeError(f"The output for {filename}, is larger than 50kb - got {binary_size}kb") else: - expected = torch.load(expected_file) + expected = torch.load(expected_file, weights_only=True) rtol = rtol or prec # keeping prec param for legacy reason, but could be removed ideally atol = atol or prec - torch.testing.assert_close(output, expected, rtol=rtol, atol=atol, check_dtype=False) + torch.testing.assert_close(output, expected, rtol=rtol, atol=atol, check_dtype=False, check_device=False) def _check_jit_scriptable(nn_module, args, unwrapper=None, eager_out=None): @@ -128,6 +167,7 @@ def get_export_import_copy(m): return imported sm = torch.jit.script(nn_module) + sm.eval() if eager_out is None: with torch.no_grad(), freeze_rng_state(): @@ -153,7 +193,8 @@ def _check_fx_compatible(model, inputs, eager_out=None): model_fx = torch.fx.symbolic_trace(model) if eager_out is None: eager_out = model(inputs) - fx_out = model_fx(inputs) + with torch.no_grad(), freeze_rng_state(): + fx_out = model_fx(inputs) torch.testing.assert_close(eager_out, fx_out) @@ -237,17 +278,23 @@ def _check_input_backprop(model, inputs): # tests under test_quantized_classification_model will be skipped for the following models. quantized_flaky_models = ("inception_v3", "resnet50") +# The tests for the following detection models are flaky. +# We run those tests on float64 to avoid floating point errors. +# FIXME: we shouldn't have to do that :'/ +detection_flaky_models = ("keypointrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn_v2") + # The following contains configuration parameters for all models which are used by # the _test_*_model methods. 
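
For orientation, each entry in the table below is layered over the per-test defaults and then split into test-only keys and builder keyword arguments; the consumption pattern inside test_classification_model / test_detection_model further down is roughly:

kwargs = {**defaults, **_model_params.get(model_name, {})}
input_shape = kwargs.pop("input_shape")       # shapes the input tensor, never passed to the builder
real_image = kwargs.pop("real_image", False)  # switches from random input to the Grace Hopper asset
model = model_fn(**kwargs)                    # everything that remains goes to the model builder
x = _get_image(input_shape=input_shape, real_image=real_image, device=dev)
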
_model_params = { - "inception_v3": {"input_shape": (1, 3, 299, 299)}, + "inception_v3": {"input_shape": (1, 3, 299, 299), "init_weights": True}, "retinanet_resnet50_fpn": { "num_classes": 20, "score_thresh": 0.01, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "retinanet_resnet50_fpn_v2": { "num_classes": 20, @@ -255,6 +302,7 @@ def _check_input_backprop(model, inputs): "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "keypointrcnn_resnet50_fpn": { "num_classes": 2, @@ -262,18 +310,21 @@ def _check_input_backprop(model, inputs): "max_size": 224, "box_score_thresh": 0.17, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn_v2": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fcos_resnet50_fpn": { "num_classes": 2, @@ -281,18 +332,21 @@ def _check_input_backprop(model, inputs): "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn_v2": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_mobilenet_v3_large_fpn": { "box_score_thresh": 0.02076, @@ -315,6 +369,7 @@ def _check_input_backprop(model, inputs): "s3d": { "input_shape": (1, 3, 16, 224, 224), }, + "googlenet": {"init_weights": True}, } # speeding up slow models: slow_models = [ @@ -343,12 +398,25 @@ def _check_input_backprop(model, inputs): _model_params[m] = {"input_shape": (1, 3, 64, 64)} -# skip big models to reduce memory usage on CI test +# skip big models to reduce memory usage on CI test. We can exclude combinations of (platform-system, device). 
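
To make the (platform-system, device) exclusion concrete, this is how the mapping below is queried by is_skippable(), using the entries that follow:

is_skippable("vit_h_14", "cuda:0")   # True on Windows, False on Linux: skipped on Windows only
is_skippable("mvit_v1_b", "cuda:0")  # True on Windows and Linux: CUDA runs are always skipped
is_skippable("mvit_v1_b", "cpu")     # False everywhere: CPU runs stay enabled
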
skipped_big_models = { - "vit_h_14", - "regnet_y_128gf", + "vit_h_14": {("Windows", "cpu"), ("Windows", "cuda")}, + "regnet_y_128gf": {("Windows", "cpu"), ("Windows", "cuda")}, + "mvit_v1_b": {("Windows", "cuda"), ("Linux", "cuda")}, + "mvit_v2_s": {("Windows", "cuda"), ("Linux", "cuda")}, } + +def is_skippable(model_name, device): + if model_name not in skipped_big_models: + return False + + platform_system = platform.system() + device_name = str(device).split(":")[0] + + return (platform_system, device_name) in skipped_big_models[model_name] + + # The following contains configuration and expected values to be used tests that are model specific _model_tests_values = { "retinanet_resnet50_fpn": { @@ -598,13 +666,14 @@ def vitc_b_16(**kwargs: Any): @pytest.mark.parametrize("model_fn", [vitc_b_16]) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_vitc_models(model_fn, dev): test_classification_model(model_fn, dev) +@torch.backends.cudnn.flags(allow_tf32=False) # see: https://github.com/pytorch/vision/issues/7618 @pytest.mark.parametrize("model_fn", list_model_fns(models)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_classification_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -612,18 +681,25 @@ def test_classification_model(model_fn, dev): "input_shape": (1, 3, 224, 224), } model_name = model_fn.__name__ - if SKIP_BIG_MODEL and model_name in skipped_big_models: + if SKIP_BIG_MODEL and is_skippable(model_name, dev): pytest.skip("Skipped to reduce memory usage. Set env var SKIP_BIG_MODEL=0 to enable test for this model") kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-3) + # FIXME: this if/else is nasty and only here to please our CI prior to the + # release. We rethink these tests altogether. + if model_name == "resnet101": + prec = 0.2 + else: + # FIXME: this is probably still way too high. + prec = 0.1 + _assert_expected(out.cpu(), model_name, prec=prec) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -640,7 +716,7 @@ def test_classification_model(model_fn, dev): @pytest.mark.parametrize("model_fn", list_model_fns(models.segmentation)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_segmentation_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -656,7 +732,8 @@ def test_segmentation_model(model_fn, dev): model.eval().to(device=dev) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) - out = model(x) + with torch.no_grad(), freeze_rng_state(): + out = model(x) def check_out(out): prec = 0.01 @@ -670,8 +747,10 @@ def check_out(out): # so instead of validating the probability scores, check that the class # predictions match. 
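# (The partial check below compares only the per-pixel argmax of the logits;
# check_device=False lets CUDA outputs be compared against the stored expected files
# regardless of the device they were generated on.)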
expected_file = _get_expected_file(model_name) - expected = torch.load(expected_file) - torch.testing.assert_close(out.argmax(dim=1), expected.argmax(dim=1), rtol=prec, atol=prec) + expected = torch.load(expected_file, weights_only=True) + torch.testing.assert_close( + out.argmax(dim=1), expected.argmax(dim=1), rtol=prec, atol=prec, check_device=False + ) return False # Partial validation performed return True # Full validation performed @@ -682,7 +761,7 @@ def check_out(out): _check_fx_compatible(model, x, eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(x) # See autocast_flaky_numerics comment at top of file. if model_name not in autocast_flaky_numerics: @@ -702,7 +781,7 @@ def check_out(out): @pytest.mark.parametrize("model_fn", list_model_fns(models.detection)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_detection_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -711,15 +790,20 @@ def test_detection_model(model_fn, dev): "input_shape": (3, 300, 300), } model_name = model_fn.__name__ + if model_name in detection_flaky_models: + dtype = torch.float64 + else: + dtype = torch.get_default_dtype() kwargs = {**defaults, **_model_params.get(model_name, {})} input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) - model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + model.eval().to(device=dev, dtype=dtype) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev, dtype=dtype) model_input = [x] - out = model(model_input) + with torch.no_grad(), freeze_rng_state(): + out = model(model_input) assert model_input[0] is x def check_out(out): @@ -763,7 +847,7 @@ def compute_mean_std(tensor): # as in NMSTester.test_nms_cuda to see if this is caused by duplicate # scores. expected_file = _get_expected_file(model_name) - expected = torch.load(expected_file) + expected = torch.load(expected_file, weights_only=True) torch.testing.assert_close( output[0]["scores"], expected[0]["scores"], rtol=prec, atol=prec, check_device=False, check_dtype=False ) @@ -780,7 +864,7 @@ def compute_mean_std(tensor): _check_jit_scriptable(model, ([x],), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(model_input) # See autocast_flaky_numerics comment at top of file. if model_name not in autocast_flaky_numerics: @@ -829,7 +913,7 @@ def test_detection_model_validation(model_fn): @pytest.mark.parametrize("model_fn", list_model_fns(models.video)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_video_model(model_fn, dev): set_rng_seed(0) # the default input shape is @@ -839,7 +923,7 @@ def test_video_model(model_fn, dev): "num_classes": 50, } model_name = model_fn.__name__ - if SKIP_BIG_MODEL and model_name in skipped_big_models: + if SKIP_BIG_MODEL and is_skippable(model_name, dev): pytest.skip("Skipped to reduce memory usage. 
Set env var SKIP_BIG_MODEL=0 to enable test for this model") kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") @@ -850,7 +934,7 @@ def test_video_model(model_fn, dev): # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-5) + _assert_expected(out.cpu(), model_name, prec=0.1) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -893,7 +977,7 @@ def test_quantized_classification_model(model_fn): out = model(x) if model_name not in quantized_flaky_models: - _assert_expected(out, model_name + "_quantized", prec=2e-2) + _assert_expected(out.cpu(), model_name + "_quantized", prec=2e-2) assert out.shape[-1] == 5 _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -943,7 +1027,7 @@ def test_raft(model_fn, scripted): torch.manual_seed(0) # We need very small images, otherwise the pickle size would exceed the 50KB - # As a resut we need to override the correlation pyramid to not downsample + # As a result we need to override the correlation pyramid to not downsample # too much, otherwise we would get nan values (effective H and W would be # reduced to 1) corr_block = models.optical_flow.raft.CorrBlock(num_levels=2, radius=2) @@ -959,8 +1043,8 @@ def test_raft(model_fn, scripted): preds = model(img1, img2) flow_pred = preds[-1] # Tolerance is fairly high, but there are 2 * H * W outputs to check - # The .pkl were generated on the AWS cluter, on the CI it looks like the resuts are slightly different - _assert_expected(flow_pred, name=model_fn.__name__, atol=1e-2, rtol=1) + # The .pkl were generated on the AWS cluter, on the CI it looks like the results are slightly different + _assert_expected(flow_pred.cpu(), name=model_fn.__name__, atol=1e-2, rtol=1) if __name__ == "__main__": diff --git a/test/test_models_detection_utils.py b/test/test_models_detection_utils.py index 09895057a9a..69703ab5817 100644 --- a/test/test_models_detection_utils.py +++ b/test/test_models_detection_utils.py @@ -38,7 +38,7 @@ def test_box_linear_coder(self): def test_resnet_fpn_backbone_frozen_layers(self, train_layers, exp_froz_params): # we know how many initial layers and parameters of the network should # be frozen for each trainable_backbone_layers parameter value - # i.e all 53 params are frozen if trainable_backbone_layers=0 + # i.e. 
all 53 params are frozen if trainable_backbone_layers=0 # ad first 24 params are frozen if trainable_backbone_layers=2 model = backbone_utils.resnet_fpn_backbone("resnet50", weights=None, trainable_layers=train_layers) # boolean list that is true if the param at that index is frozen diff --git a/test/test_onnx.py b/test/test_onnx.py index d5dae64b4d0..c9d91454d7c 100644 --- a/test/test_onnx.py +++ b/test/test_onnx.py @@ -1,6 +1,6 @@ import io from collections import OrderedDict -from typing import List, Tuple +from typing import Optional import pytest import torch @@ -11,7 +11,7 @@ from torchvision.models.detection.roi_heads import RoIHeads from torchvision.models.detection.rpn import AnchorGenerator, RegionProposalNetwork, RPNHead from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.ops._register_onnx_ops import _onnx_opset_version +from torchvision.ops import _register_onnx_ops # In environments without onnxruntime we prefer to # invoke all tests in the repo and have this one skipped rather than fail. @@ -27,12 +27,15 @@ def run_model( self, model, inputs_list, - tolerate_small_mismatch=False, do_constant_folding=True, dynamic_axes=None, output_names=None, input_names=None, + opset_version: Optional[int] = None, ): + if opset_version is None: + opset_version = _register_onnx_ops.BASE_ONNX_OPSET_VERSION + model.eval() onnx_io = io.BytesIO() @@ -46,10 +49,11 @@ def run_model( torch_onnx_input, onnx_io, do_constant_folding=do_constant_folding, - opset_version=_onnx_opset_version, + opset_version=opset_version, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, + verbose=True, ) # validate the exported model with onnx runtime for test_inputs in inputs_list: @@ -59,9 +63,9 @@ def run_model( test_ouputs = model(*test_inputs) if isinstance(test_ouputs, torch.Tensor): test_ouputs = (test_ouputs,) - self.ort_validate(onnx_io, test_inputs, test_ouputs, tolerate_small_mismatch) + self.ort_validate(onnx_io, test_inputs, test_ouputs) - def ort_validate(self, onnx_io, inputs, outputs, tolerate_small_mismatch=False): + def ort_validate(self, onnx_io, inputs, outputs): inputs, _ = torch.jit._flatten(inputs) outputs, _ = torch.jit._flatten(outputs) @@ -75,19 +79,13 @@ def to_numpy(tensor): inputs = list(map(to_numpy, inputs)) outputs = list(map(to_numpy, outputs)) - ort_session = onnxruntime.InferenceSession(onnx_io.getvalue()) + ort_session = onnxruntime.InferenceSession(onnx_io.getvalue(), providers=onnxruntime.get_available_providers()) # compute onnxruntime output prediction ort_inputs = {ort_session.get_inputs()[i].name: inpt for i, inpt in enumerate(inputs)} ort_outs = ort_session.run(None, ort_inputs) for i in range(0, len(outputs)): - try: - torch.testing.assert_allclose(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) - except AssertionError as error: - if tolerate_small_mismatch: - assert "(0.00%)" in str(error), str(error) - else: - raise + torch.testing.assert_close(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) def test_nms(self): num_boxes = 100 @@ -140,39 +138,39 @@ def test_roi_align(self): model = ops.RoIAlign((5, 5), 1, -1) self.run_model(model, [(x, single_roi)]) - @pytest.mark.skip(reason="ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16.") def test_roi_align_aligned(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) model = 
ops.RoIAlign((5, 5), 1, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 0.5, 3, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1.8, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, 0, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, -1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) - @pytest.mark.skip(reason="Issue in exporting ROIAlign with aligned = True for malformed boxes") def test_roi_align_malformed_boxes(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.randn(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 2, 0.3, 1.5, 1.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) def test_roi_pool(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) @@ -320,7 +318,6 @@ def forward(self_module, images, features): self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -396,7 +393,6 @@ def forward(self_module, images, features): self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -408,20 +404,19 @@ def forward(self_module, images, features): }, ) - def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: + def get_image(self, rel_path: str, size: tuple[int, int]) -> torch.Tensor: import os - import torchvision.transforms._pil_constants as _pil_constants from PIL import Image from torchvision.transforms import functional as F data_dir = os.path.join(os.path.dirname(__file__), "assets") path = os.path.join(data_dir, *rel_path.split("/")) - image = Image.open(path).convert("RGB").resize(size, _pil_constants.BILINEAR) + image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR) return F.convert_image_dtype(F.pil_to_tensor(image)) - def get_test_images(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + def get_test_images(self) -> tuple[list[torch.Tensor], list[torch.Tensor]]: return ( [self.get_image("encode_jpeg/grace_hopper_517x606.jpg", (100, 320))], [self.get_image("fakedata/logos/rgb_pytorch.png", (250, 380))], @@ -442,7 +437,6 @@ def test_faster_rcnn(self): 
input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -451,7 +445,6 @@ def test_faster_rcnn(self): input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Verify that paste_mask_in_image beahves the same in tracing. @@ -506,7 +499,6 @@ def test_mask_rcnn(self): "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -521,7 +513,6 @@ def test_mask_rcnn(self): "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Verify that heatmaps_to_keypoints behaves the same in tracing. @@ -563,7 +554,6 @@ def test_keypoint_rcnn(self): input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) self.run_model( @@ -572,7 +562,6 @@ def test_keypoint_rcnn(self): input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) def test_shufflenet_v2_dynamic_axes(self): @@ -586,7 +575,6 @@ def test_shufflenet_v2_dynamic_axes(self): input_names=["input_images"], output_names=["output"], dynamic_axes={"input_images": {0: "batch_size"}, "output": {0: "batch_size"}}, - tolerate_small_mismatch=True, ) diff --git a/test/test_ops.py b/test/test_ops.py index a1cb5aa33d6..d2cf8d29181 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -1,24 +1,50 @@ import math import os from abc import ABC, abstractmethod -from functools import lru_cache +from functools import lru_cache, partial from itertools import product -from typing import Callable, List, Tuple +from typing import Callable import numpy as np import pytest import torch import torch.fx import torch.nn.functional as F -from common_utils import assert_equal, cpu_and_gpu, needs_cuda +import torch.testing._internal.optests as optests +from common_utils import assert_equal, cpu_and_cuda, cpu_and_cuda_and_mps, needs_cuda, needs_mps from PIL import Image from torch import nn, Tensor +from torch._dynamo.utils import is_compile_supported from torch.autograd import gradcheck from torch.nn.modules.utils import _pair from torchvision import models, ops from torchvision.models.feature_extraction import get_graph_node_names +OPTESTS = [ + "test_schema", + "test_autograd_registration", + "test_faketensor", + "test_aot_dispatch_dynamic", +] + + +# Context manager for setting deterministic flag and automatically +# resetting it to its original value +class DeterministicGuard: + def __init__(self, deterministic, *, warn_only=False): + self.deterministic = deterministic + self.warn_only = warn_only + + def __enter__(self): + self.deterministic_restore = torch.are_deterministic_algorithms_enabled() + self.warn_only_restore = torch.is_deterministic_algorithms_warn_only_enabled() + torch.use_deterministic_algorithms(self.deterministic, warn_only=self.warn_only) + + def __exit__(self, exception_type, exception_value, traceback): + torch.use_deterministic_algorithms(self.deterministic_restore, warn_only=self.warn_only_restore) + + class RoIOpTesterModuleWrapper(nn.Module): def __init__(self, obj): super().__init__() @@ -74,20 +100,43 @@ def 
__init__(self, pool: nn.Module): super().__init__() self.pool = pool - def forward(self, imgs: Tensor, boxes: List[Tensor]) -> Tensor: + def forward(self, imgs: Tensor, boxes: list[Tensor]) -> Tensor: return self.pool(imgs, boxes) class RoIOpTester(ABC): dtype = torch.float64 + mps_dtype = torch.float32 + mps_backward_atol = 2e-2 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwargs): - x_dtype = self.dtype if x_dtype is None else x_dtype - rois_dtype = self.dtype if rois_dtype is None else rois_dtype + @pytest.mark.parametrize( + "x_dtype", + ( + torch.float16, + torch.float32, + torch.float64, + ), + ids=str, + ) + def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, deterministic=False, **kwargs): + if device == "mps" and x_dtype is torch.float64: + pytest.skip("MPS does not support float64") + + rois_dtype = x_dtype if rois_dtype is None else rois_dtype + + tol = 1e-5 + if x_dtype is torch.half: + if device == "mps": + tol = 5e-3 + else: + tol = 4e-3 + elif x_dtype == torch.bfloat16: + tol = 5e-3 + pool_size = 5 - # n_channels % (pool_size ** 2) == 0 required for PS opeartions. + # n_channels % (pool_size ** 2) == 0 required for PS operations. n_channels = 2 * (pool_size**2) x = torch.rand(2, n_channels, 10, 10, dtype=x_dtype, device=device) if not contiguous: @@ -99,17 +148,17 @@ def test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwar ) pool_h, pool_w = pool_size, pool_size - y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) + with DeterministicGuard(deterministic): + y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) # the following should be true whether we're running an autocast test or not. 
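# (i.e. even under torch.cuda.amp.autocast the op is expected to return the input dtype;
# the relaxed tol of 4e-3 / 5e-3 chosen above accounts for the float16 / bfloat16 cases.)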
assert y.dtype == x.dtype gt_y = self.expected_fn( - x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=self.dtype, **kwargs + x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=x_dtype, **kwargs ) - tol = 1e-3 if (x_dtype is torch.half or rois_dtype is torch.half) else 1e-5 torch.testing.assert_close(gt_y.to(y), y, rtol=tol, atol=tol) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -118,17 +167,39 @@ def test_is_leaf_node(self, device): assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_torch_fx_trace(self, device, x_dtype=torch.float, rois_dtype=torch.float): + op_obj = self.make_obj().to(device=device) + graph_module = torch.fx.symbolic_trace(op_obj) + pool_size = 5 + n_channels = 2 * (pool_size**2) + x = torch.rand(2, n_channels, 5, 5, dtype=x_dtype, device=device) + rois = torch.tensor( + [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]], # format is (xyxy) + dtype=rois_dtype, + device=device, + ) + output_gt = op_obj(x, rois) + assert output_gt.dtype == x.dtype + output_fx = graph_module(x, rois) + assert output_fx.dtype == x.dtype + tol = 1e-5 + torch.testing.assert_close(output_gt, output_fx, rtol=tol, atol=tol) + @pytest.mark.parametrize("seed", range(10)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_backward(self, seed, device, contiguous): + def test_backward(self, seed, device, contiguous, deterministic=False): + atol = self.mps_backward_atol if device == "mps" else 1e-05 + dtype = self.mps_dtype if device == "mps" else self.dtype + torch.random.manual_seed(seed) pool_size = 2 - x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=self.dtype, device=device, requires_grad=True) + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=dtype, device=device, requires_grad=True) if not contiguous: x = x.permute(0, 1, 3, 2) rois = torch.tensor( - [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=self.dtype, device=device # format is (xyxy) + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=dtype, device=device # format is (xyxy) ) def func(z): @@ -136,8 +207,26 @@ def func(z): script_func = self.get_script_fn(rois, pool_size) - gradcheck(func, (x,)) - gradcheck(script_func, (x,)) + with DeterministicGuard(deterministic): + gradcheck(func, (x,), atol=atol) + + gradcheck(script_func, (x,), atol=atol) + + @needs_mps + def test_mps_error_inputs(self): + pool_size = 2 + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=torch.float16, device="mps", requires_grad=True) + rois = torch.tensor( + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=torch.float16, device="mps" # format is (xyxy) + ) + + def func(z): + return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1) + + with pytest.raises( + RuntimeError, match="MPS does not support (?:ps_)?roi_(?:align|pool)? backward with float16 inputs." 
+ ): + gradcheck(func, (x,)) @needs_cuda @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @@ -153,7 +242,7 @@ def _helper_boxes_shape(self, func): boxes = torch.tensor([[0, 0, 3, 3]], dtype=a.dtype) func(a, boxes, output_size=(2, 2)) - # test boxes as List[Tensor[N, 4]] + # test boxes as list[Tensor[N, 4]] with pytest.raises(AssertionError): a = torch.linspace(1, 8 * 8, 8 * 8).reshape(1, 1, 8, 8) boxes = torch.tensor([[0, 0, 3]], dtype=a.dtype) @@ -233,6 +322,8 @@ def test_jit_boxes_list(self): class TestPSRoIPool(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois) @@ -314,6 +405,8 @@ def bilinear_interpolate(data, y, x, snap_border=False): class TestRoIAlign(RoIOpTester): + mps_backward_atol = 6e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, aligned=False, **kwargs): return ops.RoIAlign( (pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio, aligned=aligned @@ -365,7 +458,6 @@ def expected_fn( grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w)) for channel in range(0, n_channels): - val = 0 for iy in range(0, grid_h): y = start_h + (iy + 0.5) * bin_h / grid_h @@ -381,23 +473,70 @@ def test_boxes_shape(self): self._helper_boxes_shape(ops.roi_align) @pytest.mark.parametrize("aligned", (True, False)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("x_dtype", (torch.float16, torch.float32, torch.float64)) # , ids=str) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, aligned, x_dtype=None, rois_dtype=None): + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.opcheck_only_one() + def test_forward(self, device, contiguous, deterministic, aligned, x_dtype, rois_dtype=None): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") super().test_forward( - device=device, contiguous=contiguous, x_dtype=x_dtype, rois_dtype=rois_dtype, aligned=aligned + device=device, + contiguous=contiguous, + deterministic=deterministic, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + aligned=aligned, ) @needs_cuda @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @pytest.mark.parametrize("rois_dtype", (torch.float, torch.half)) - def test_autocast(self, aligned, x_dtype, rois_dtype): + @pytest.mark.opcheck_only_one() + def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype): with torch.cuda.amp.autocast(): self.test_forward( - torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype, rois_dtype=rois_dtype + torch.device("cuda"), + contiguous=False, + deterministic=deterministic, + aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, ) + @pytest.mark.skip(reason="1/5000 flaky failure") + @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.parametrize("x_dtype", (torch.float, torch.bfloat16)) + @pytest.mark.parametrize("rois_dtype", (torch.float, torch.bfloat16)) + def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype): + with torch.cpu.amp.autocast(): + self.test_forward( + torch.device("cpu"), + contiguous=False, + deterministic=deterministic, + 
aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + ) + + @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("contiguous", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + @pytest.mark.opcheck_only_one() + def test_backward(self, seed, device, contiguous, deterministic): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") + if deterministic and device == "mps": + pytest.skip("no deterministic implementation for mps") + if deterministic and not is_compile_supported(device): + pytest.skip("deterministic implementation only if torch.compile supported") + super().test_backward(seed, device, contiguous, deterministic) + def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype) rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index @@ -407,6 +546,7 @@ def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): @pytest.mark.parametrize("aligned", (True, False)) @pytest.mark.parametrize("scale, zero_point", ((1, 0), (2, 10), (0.1, 50))) @pytest.mark.parametrize("qdtype", (torch.qint8, torch.quint8, torch.qint32)) + @pytest.mark.opcheck_only_one() def test_qroialign(self, aligned, scale, zero_point, qdtype): """Make sure quantized version of RoIAlign is close to float version""" pool_size = 5 @@ -477,6 +617,8 @@ def test_jit_boxes_list(self): class TestPSRoIAlign(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)(x, rois) @@ -531,6 +673,43 @@ def test_boxes_shape(self): self._helper_boxes_shape(ops.ps_roi_align) +@pytest.mark.parametrize( + "op", + ( + torch.ops.torchvision.roi_pool, + torch.ops.torchvision.ps_roi_pool, + torch.ops.torchvision.roi_align, + torch.ops.torchvision.ps_roi_align, + ), +) +@pytest.mark.parametrize("dtype", (torch.float16, torch.float32, torch.float64)) +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("requires_grad", (True, False)) +def test_roi_opcheck(op, dtype, device, requires_grad): + # This manually calls opcheck() on the roi ops. We do that instead of + # relying on opcheck.generate_opcheck_tests() as e.g. done for nms, because + # pytest and generate_opcheck_tests() don't interact very well when it comes + # to skipping tests - and these ops need to skip the MPS tests since MPS we + # don't support dynamic shapes yet for MPS. 
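# (optests.opcheck runs essentially the same utilities listed in OPTESTS at the top of
# this file - schema, autograd registration, faketensor, AOT dispatch - against a single
# concrete call, so the device parametrization above can simply leave out "mps".)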
+ rois = torch.tensor( + [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]], + dtype=dtype, + device=device, + requires_grad=requires_grad, + ) + pool_size = 5 + num_channels = 2 * (pool_size**2) + x = torch.rand(2, num_channels, 10, 10, dtype=dtype, device=device) + + kwargs = dict(rois=rois, spatial_scale=1, pooled_height=pool_size, pooled_width=pool_size) + if op in (torch.ops.torchvision.roi_align, torch.ops.torchvision.ps_roi_align): + kwargs["sampling_ratio"] = -1 + if op is torch.ops.torchvision.roi_align: + kwargs["aligned"] = True + + optests.opcheck(op, args=(x,), kwargs=kwargs) + + class TestMultiScaleRoIAlign: def make_obj(self, fmap_names=None, output_size=(7, 7), sampling_ratio=2, wrap=False): if fmap_names is None: @@ -552,7 +731,7 @@ def test_msroialign_repr(self): ) assert repr(t) == expected_string - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -566,8 +745,9 @@ class TestNMS: def _reference_nms(self, boxes, scores, iou_threshold): """ Args: - box_scores (N, 5): boxes in corner-form and probabilities. - iou_threshold: intersection over union threshold. + boxes: boxes in corner-form + scores: probabilities + iou_threshold: intersection over union threshold Returns: picked: a list of indexes of the kept boxes """ @@ -605,13 +785,14 @@ def _create_tensors_with_iou(self, N, iou_thresh): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.opcheck_only_one() def test_nms_ref(self, iou, seed): torch.random.manual_seed(seed) err_msg = "NMS incompatible between CPU and reference implementation for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) keep_ref = self._reference_nms(boxes, scores, iou) keep = ops.nms(boxes, scores, iou) - assert torch.allclose(keep, keep_ref), err_msg.format(iou) + torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) def test_nms_input_errors(self): with pytest.raises(RuntimeError): @@ -625,13 +806,14 @@ def test_nms_input_errors(self): @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("scale, zero_point", ((1, 0), (2, 50), (3, 10))) + @pytest.mark.opcheck_only_one() def test_qnms(self, iou, scale, zero_point): # Note: we compare qnms vs nms instead of qnms vs reference implementation. 
- # This is because with the int convertion, the trick used in _create_tensors_with_iou + # This is because with the int conversion, the trick used in _create_tensors_with_iou # doesn't really work (in fact, nms vs reference implem will also fail with ints) err_msg = "NMS and QNMS give different results for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) - scores *= 100 # otherwise most scores would be 0 or 1 after int convertion + scores *= 100 # otherwise most scores would be 0 or 1 after int conversion qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point, dtype=torch.quint8) qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point, dtype=torch.quint8) @@ -642,42 +824,67 @@ def test_qnms(self, iou, scale, zero_point): keep = ops.nms(boxes, scores, iou) qkeep = ops.nms(qboxes, qscores, iou) - assert torch.allclose(qkeep, keep), err_msg.format(iou) + torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) - @needs_cuda + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) - def test_nms_cuda(self, iou, dtype=torch.float64): + @pytest.mark.opcheck_only_one() + def test_nms_gpu(self, iou, device, dtype=torch.float64): + dtype = torch.float32 if device == "mps" else dtype tol = 1e-3 if dtype is torch.half else 1e-5 err_msg = "NMS incompatible between CPU and CUDA for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) r_cpu = ops.nms(boxes, scores, iou) - r_cuda = ops.nms(boxes.cuda(), scores.cuda(), iou) + r_gpu = ops.nms(boxes.to(device), scores.to(device), iou) - is_eq = torch.allclose(r_cpu, r_cuda.cpu()) + is_eq = torch.allclose(r_cpu, r_gpu.cpu()) if not is_eq: # if the indices are not the same, ensure that it's because the scores # are duplicate - is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol) + is_eq = torch.allclose(scores[r_cpu], scores[r_gpu.cpu()], rtol=tol, atol=tol) assert is_eq, err_msg.format(iou) @needs_cuda @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) @pytest.mark.parametrize("dtype", (torch.float, torch.half)) + @pytest.mark.opcheck_only_one() def test_autocast(self, iou, dtype): with torch.cuda.amp.autocast(): - self.test_nms_cuda(iou=iou, dtype=dtype) + self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda") - @needs_cuda - def test_nms_cuda_float16(self): + @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) + @pytest.mark.parametrize("dtype", (torch.float, torch.bfloat16)) + def test_autocast_cpu(self, iou, dtype): + boxes, scores = self._create_tensors_with_iou(1000, iou) + with torch.cpu.amp.autocast(): + keep_ref_float = ops.nms(boxes.to(dtype).float(), scores.to(dtype).float(), iou) + keep_dtype = ops.nms(boxes.to(dtype), scores.to(dtype), iou) + torch.testing.assert_close(keep_ref_float, keep_dtype) + + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) + @pytest.mark.opcheck_only_one() + def test_nms_float16(self, device): boxes = torch.tensor( [ [285.3538, 185.5758, 1193.5110, 851.4551], [285.1472, 188.7374, 1192.4984, 851.0669], [279.2440, 197.9812, 1189.4746, 849.2019], ] - ).cuda() - scores = torch.tensor([0.6370, 0.7569, 0.3966]).cuda() + ).to(device) + scores = torch.tensor([0.6370, 0.7569, 0.3966]).to(device) iou_thres = 0.2 keep32 = ops.nms(boxes, scores, iou_thres) @@ -685,6 +892,7 @@ 
def test_nms_cuda_float16(self): assert_equal(keep32, keep16) @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.opcheck_only_one() def test_batched_nms_implementations(self, seed): """Make sure that both implementations of batched_nms yield identical results""" torch.random.manual_seed(seed) @@ -710,8 +918,18 @@ def test_batched_nms_implementations(self, seed): torch.testing.assert_close(empty, ops.batched_nms(empty, None, None, None)) +optests.generate_opcheck_tests( + testcase=TestNMS, + namespaces=["torchvision"], + failures_dict_path=os.path.join(os.path.dirname(__file__), "optests_failures_dict.json"), + additional_decorators=[], + test_utils=OPTESTS, +) + + class TestDeformConv: dtype = torch.float64 + mps_dtype = torch.float32 def expected_fn(self, x, weight, offset, mask, bias, stride=1, padding=0, dilation=1): stride_h, stride_w = _pair(stride) @@ -824,7 +1042,7 @@ def make_obj(self, in_channels=6, out_channels=2, kernel_size=(3, 2), groups=2, ) return DeformConvModuleWrapper(obj) if wrap else obj - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -833,11 +1051,11 @@ def test_is_leaf_node(self, device): assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) def test_forward(self, device, contiguous, batch_sz, dtype=None): - dtype = dtype or self.dtype + dtype = self.mps_dtype if device == "mps" else dtype or self.dtype x, _, offset, mask, _, stride, padding, dilation = self.get_fn_args(device, contiguous, batch_sz, dtype) in_channels = 6 out_channels = 2 @@ -855,7 +1073,7 @@ def test_forward(self, device, contiguous, batch_sz, dtype=None): expected = self.expected_fn(x, weight, offset, mask, bias, stride=stride, padding=padding, dilation=dilation) torch.testing.assert_close( - res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres:\n{res}\nexpected:\n{expected}" + res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres: \n{res}\nexpected: \n{expected}" ) # no modulation test @@ -863,7 +1081,7 @@ def test_forward(self, device, contiguous, batch_sz, dtype=None): expected = self.expected_fn(x, weight, offset, None, bias, stride=stride, padding=padding, dilation=dilation) torch.testing.assert_close( - res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres:\n{res}\nexpected:\n{expected}" + res.to(expected), expected, rtol=tol, atol=tol, msg=f"\nres: \n{res}\nexpected: \n{expected}" ) def test_wrong_sizes(self): @@ -885,9 +1103,10 @@ def test_wrong_sizes(self): wrong_mask = torch.rand_like(mask[:, :2]) layer(x, offset, wrong_mask) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) + @pytest.mark.opcheck_only_one() def test_backward(self, device, contiguous, batch_sz): x, weight, offset, mask, bias, stride, padding, dilation = self.get_fn_args( device, contiguous, batch_sz, self.dtype @@ -937,6 +1156,7 @@ def script_func_no_mask(x_, offset_, weight_, bias_, stride_, pad_, dilation_): @needs_cuda @pytest.mark.parametrize("contiguous", (True, False)) + 
@pytest.mark.opcheck_only_one() def test_compare_cpu_cuda_grads(self, contiguous): # Test from https://github.com/pytorch/vision/issues/2598 # Run on CUDA only @@ -958,7 +1178,6 @@ def test_compare_cpu_cuda_grads(self, contiguous): weight = init_weight for d in ["cpu", "cuda"]: - out = ops.deform_conv2d(img.to(d), offset.to(d), weight.to(d), padding=1, mask=mask.to(d)) out.mean().backward() if true_cpu_grads is None: @@ -972,6 +1191,7 @@ def test_compare_cpu_cuda_grads(self, contiguous): @needs_cuda @pytest.mark.parametrize("batch_sz", (0, 33)) @pytest.mark.parametrize("dtype", (torch.float, torch.half)) + @pytest.mark.opcheck_only_one() def test_autocast(self, batch_sz, dtype): with torch.cuda.amp.autocast(): self.test_forward(torch.device("cuda"), contiguous=False, batch_sz=batch_sz, dtype=dtype) @@ -981,6 +1201,34 @@ def test_forward_scriptability(self): torch.jit.script(ops.DeformConv2d(in_channels=8, out_channels=8, kernel_size=3)) +# NS: Remove me once backward is implemented for MPS +def xfail_if_mps(x): + mps_xfail_param = pytest.param("mps", marks=(pytest.mark.needs_mps, pytest.mark.xfail)) + new_pytestmark = [] + for mark in x.pytestmark: + if isinstance(mark, pytest.Mark) and mark.name == "parametrize": + if mark.args[0] == "device": + params = cpu_and_cuda() + (mps_xfail_param,) + new_pytestmark.append(pytest.mark.parametrize("device", params)) + continue + new_pytestmark.append(mark) + x.__dict__["pytestmark"] = new_pytestmark + return x + + +optests.generate_opcheck_tests( + testcase=TestDeformConv, + namespaces=["torchvision"], + failures_dict_path=os.path.join(os.path.dirname(__file__), "optests_failures_dict.json"), + # Skip tests due to unimplemented backward + additional_decorators={ + "test_aot_dispatch_dynamic__test_forward": [xfail_if_mps], + "test_autograd_registration__test_forward": [xfail_if_mps], + }, + test_utils=OPTESTS, +) + + class TestFrozenBNT: def test_frozenbatchnorm2d_repr(self): num_features = 32 @@ -1110,8 +1358,69 @@ def test_bbox_xywh_cxcywh(self): box_xywh = ops.box_convert(box_cxcywh, in_fmt="cxcywh", out_fmt="xywh") assert_equal(box_xywh, box_tensor) - @pytest.mark.parametrize("inv_infmt", ["xwyh", "cxwyh"]) - @pytest.mark.parametrize("inv_outfmt", ["xwcx", "xhwcy"]) + def test_bbox_xywhr_cxcywhr(self): + box_tensor = torch.tensor( + [ + [0, 0, 100, 100, 0], + [0, 0, 0, 0, 0], + [10, 15, 20, 20, 0], + [23, 35, 70, 60, 0], + [4, 2, 4, 2, 0], + [5, 5, 4, 2, 90], + [8, 4, 4, 2, 180], + [7, 1, 4, 2, -90], + ], + dtype=torch.float, + ) + + exp_cxcywhr = torch.tensor( + [ + [50, 50, 100, 100, 0], + [0, 0, 0, 0, 0], + [20, 25, 20, 20, 0], + [58, 65, 70, 60, 0], + [6, 3, 4, 2, 0], + [6, 3, 4, 2, 90], + [6, 3, 4, 2, 180], + [6, 3, 4, 2, -90], + ], + dtype=torch.float, + ) + + assert exp_cxcywhr.size() == torch.Size([8, 5]) + box_cxcywhr = ops.box_convert(box_tensor, in_fmt="xywhr", out_fmt="cxcywhr") + torch.testing.assert_close(box_cxcywhr, exp_cxcywhr) + + # Reverse conversion + box_xywhr = ops.box_convert(box_cxcywhr, in_fmt="cxcywhr", out_fmt="xywhr") + torch.testing.assert_close(box_xywhr, box_tensor) + + def test_bbox_cxcywhr_to_xyxyxyxy(self): + box_tensor = torch.tensor([[5, 3, 4, 2, 90]], dtype=torch.float) + exp_xyxyxyxy = torch.tensor([[4, 5, 4, 1, 6, 1, 6, 5]], dtype=torch.float) + + assert exp_xyxyxyxy.size() == torch.Size([1, 8]) + box_xyxyxyxy = ops.box_convert(box_tensor, in_fmt="cxcywhr", out_fmt="xyxyxyxy") + torch.testing.assert_close(box_xyxyxyxy, exp_xyxyxyxy) + + # Reverse conversion + box_cxcywhr = ops.box_convert(box_xyxyxyxy, 
in_fmt="xyxyxyxy", out_fmt="cxcywhr") + torch.testing.assert_close(box_cxcywhr, box_tensor) + + def test_bbox_xywhr_to_xyxyxyxy(self): + box_tensor = torch.tensor([[4, 5, 4, 2, 90]], dtype=torch.float) + exp_xyxyxyxy = torch.tensor([[4, 5, 4, 1, 6, 1, 6, 5]], dtype=torch.float) + + assert exp_xyxyxyxy.size() == torch.Size([1, 8]) + box_xyxyxyxy = ops.box_convert(box_tensor, in_fmt="xywhr", out_fmt="xyxyxyxy") + torch.testing.assert_close(box_xyxyxyxy, exp_xyxyxyxy) + + # Reverse conversion + box_xywhr = ops.box_convert(box_xyxyxyxy, in_fmt="xyxyxyxy", out_fmt="xywhr") + torch.testing.assert_close(box_xywhr, box_tensor) + + @pytest.mark.parametrize("inv_infmt", ["xwyh", "cxwyh", "xwyhr", "cxwyhr", "xxxxyyyy"]) + @pytest.mark.parametrize("inv_outfmt", ["xwcx", "xhwcy", "xwcxr", "xhwcyr", "xyxyxxyy"]) def test_bbox_invalid(self, inv_infmt, inv_outfmt): box_tensor = torch.tensor( [[0, 0, 100, 100], [0, 0, 0, 0], [10, 15, 20, 20], [23, 35, 70, 60]], dtype=torch.float @@ -1137,34 +1446,60 @@ def test_bbox_convert_jit(self): class TestBoxArea: - def area_check(self, box, expected, atol=1e-4): - out = ops.box_area(box) + def area_check(self, box, expected, fmt="xyxy", atol=1e-4): + out = ops.box_area(box, fmt=fmt) torch.testing.assert_close(out, expected, rtol=0.0, check_dtype=False, atol=atol) @pytest.mark.parametrize("dtype", [torch.int8, torch.int16, torch.int32, torch.int64]) - def test_int_boxes(self, dtype): - box_tensor = torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=dtype) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_int_boxes(self, dtype, fmt): + box_tensor = ops.box_convert( + torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=dtype), in_fmt="xyxy", out_fmt=fmt + ) expected = torch.tensor([10000, 0], dtype=torch.int32) - self.area_check(box_tensor, expected) + self.area_check(box_tensor, expected, fmt) @pytest.mark.parametrize("dtype", [torch.float32, torch.float64]) - def test_float_boxes(self, dtype): - box_tensor = torch.tensor(FLOAT_BOXES, dtype=dtype) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_float_boxes(self, dtype, fmt): + box_tensor = ops.box_convert(torch.tensor(FLOAT_BOXES, dtype=dtype), in_fmt="xyxy", out_fmt=fmt) expected = torch.tensor([604723.0806, 600965.4666, 592761.0085], dtype=dtype) - self.area_check(box_tensor, expected) - - def test_float16_box(self): - box_tensor = torch.tensor( - [[2.825, 1.8625, 3.90, 4.85], [2.825, 4.875, 19.20, 5.10], [2.925, 1.80, 8.90, 4.90]], dtype=torch.float16 + self.area_check(box_tensor, expected, fmt) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_float16_box(self, fmt): + box_tensor = ops.box_convert( + torch.tensor( + [[2.825, 1.8625, 3.90, 4.85], [2.825, 4.875, 19.20, 5.10], [2.925, 1.80, 8.90, 4.90]], + dtype=torch.float16, + ), + in_fmt="xyxy", + out_fmt=fmt, ) expected = torch.tensor([3.2170, 3.7108, 18.5071], dtype=torch.float16) - self.area_check(box_tensor, expected, atol=0.01) + self.area_check(box_tensor, expected, fmt, atol=0.01) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_box_area_jit(self, fmt): + box_tensor = ops.box_convert( + torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=torch.float), in_fmt="xyxy", out_fmt=fmt + ) + expected = ops.box_area(box_tensor, fmt) + + class BoxArea(torch.nn.Module): + # We are using this intermediate class + # since torchscript does not support + # neither partial nor lambda functions for this test. 
+ def __init__(self, fmt): + super().__init__() + self.area = ops.box_area + self.fmt = fmt + + def forward(self, boxes): + return self.area(boxes, self.fmt) - def test_box_area_jit(self): - box_tensor = torch.tensor([[0, 0, 100, 100], [0, 0, 0, 0]], dtype=torch.float) - expected = ops.box_area(box_tensor) - scripted_fn = torch.jit.script(ops.box_area) + scripted_fn = torch.jit.script(BoxArea(fmt)) scripted_area = scripted_fn(box_tensor) torch.testing.assert_close(scripted_area, expected) @@ -1178,25 +1513,28 @@ def test_box_area_jit(self): ] -def gen_box(size, dtype=torch.float): +def gen_box(size, dtype=torch.float, fmt="xyxy") -> Tensor: xy1 = torch.rand((size, 2), dtype=dtype) xy2 = xy1 + torch.rand((size, 2), dtype=dtype) - return torch.cat([xy1, xy2], axis=-1) + return ops.box_convert(torch.cat([xy1, xy2], axis=-1), in_fmt="xyxy", out_fmt=fmt) class TestIouBase: @staticmethod - def _run_test(target_fn: Callable, actual_box1, actual_box2, dtypes, atol, expected): + def _run_test(target_fn: Callable, actual_box1, actual_box2, dtypes, atol, expected, fmt="xyxy"): for dtype in dtypes: - actual_box1 = torch.tensor(actual_box1, dtype=dtype) - actual_box2 = torch.tensor(actual_box2, dtype=dtype) + _actual_box1 = ops.box_convert(torch.tensor(actual_box1, dtype=dtype), in_fmt="xyxy", out_fmt=fmt) + _actual_box2 = ops.box_convert(torch.tensor(actual_box2, dtype=dtype), in_fmt="xyxy", out_fmt=fmt) expected_box = torch.tensor(expected) - out = target_fn(actual_box1, actual_box2) + out = target_fn( + _actual_box1, + _actual_box2, + ) torch.testing.assert_close(out, expected_box, rtol=0.0, check_dtype=False, atol=atol) @staticmethod - def _run_jit_test(target_fn: Callable, actual_box: List): - box_tensor = torch.tensor(actual_box, dtype=torch.float) + def _run_jit_test(target_fn: Callable, actual_box: list, fmt="xyxy"): + box_tensor = ops.box_convert(torch.tensor(actual_box, dtype=torch.float), in_fmt="xyxy", out_fmt=fmt) expected = target_fn(box_tensor, box_tensor) scripted_fn = torch.jit.script(target_fn) scripted_out = scripted_fn(box_tensor, box_tensor) @@ -1213,12 +1551,20 @@ def _cartesian_product(boxes1, boxes2, target_fn: Callable): return result @staticmethod - def _run_cartesian_test(target_fn: Callable): - boxes1 = gen_box(5) - boxes2 = gen_box(7) + def _run_cartesian_test(target_fn: Callable, fmt: str = "xyxy"): + boxes1 = gen_box(5, fmt=fmt) + boxes2 = gen_box(7, fmt=fmt) a = TestIouBase._cartesian_product(boxes1, boxes2, target_fn) b = target_fn(boxes1, boxes2) - assert torch.allclose(a, b) + torch.testing.assert_close(a, b) + + @staticmethod + def _run_batch_test(target_fn: Callable, fmt: str = "xyxy"): + boxes1 = torch.stack([gen_box(5, fmt=fmt) for _ in range(3)], dim=0) + boxes2 = torch.stack([gen_box(5, fmt=fmt) for _ in range(3)], dim=0) + native: Tensor = target_fn(boxes1, boxes2) + iterative: Tensor = torch.stack([target_fn(*pairs) for pairs in zip(boxes1, boxes2)], dim=0) + torch.testing.assert_close(native, iterative) class TestBoxIou(TestIouBase): @@ -1233,14 +1579,33 @@ class TestBoxIou(TestIouBase): pytest.param(FLOAT_BOXES, FLOAT_BOXES, [torch.float32, torch.float64], 1e-3, float_expected), ], ) - def test_iou(self, actual_box1, actual_box2, dtypes, atol, expected): - self._run_test(ops.box_iou, actual_box1, actual_box2, dtypes, atol, expected) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou(self, actual_box1, actual_box2, dtypes, atol, expected, fmt): + self._run_test(partial(ops.box_iou, fmt=fmt), actual_box1, actual_box2, dtypes, atol, 
expected, fmt) - def test_iou_jit(self): - self._run_jit_test(ops.box_iou, INT_BOXES) + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou_jit(self, fmt): + class IoUJit(torch.nn.Module): + # We are using this intermediate class + # since torchscript does not support + # neither partial nor lambda functions for this test. + def __init__(self, fmt): + super().__init__() + self.iou = ops.box_iou + self.fmt = fmt - def test_iou_cartesian(self): - self._run_cartesian_test(ops.box_iou) + def forward(self, boxes1, boxes2): + return self.iou(boxes1, boxes2, fmt=self.fmt) + + self._run_jit_test(IoUJit(fmt=fmt), INT_BOXES, fmt) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou_cartesian(self, fmt): + self._run_cartesian_test(partial(ops.box_iou, fmt=fmt)) + + @pytest.mark.parametrize("fmt", ["xyxy", "xywh", "cxcywh"]) + def test_iou_batch(self, fmt): + self._run_batch_test(partial(ops.box_iou, fmt=fmt)) class TestGeneralizedBoxIou(TestIouBase): @@ -1264,6 +1629,9 @@ def test_iou_jit(self): def test_iou_cartesian(self): self._run_cartesian_test(ops.generalized_box_iou) + def test_iou_batch(self): + self._run_batch_test(ops.generalized_box_iou) + class TestDistanceBoxIoU(TestIouBase): int_expected = [ @@ -1291,6 +1659,9 @@ def test_iou_jit(self): def test_iou_cartesian(self): self._run_cartesian_test(ops.distance_box_iou) + def test_iou_batch(self): + self._run_batch_test(ops.distance_box_iou) + class TestCompleteBoxIou(TestIouBase): int_expected = [ @@ -1318,6 +1689,9 @@ def test_iou_jit(self): def test_iou_cartesian(self): self._run_cartesian_test(ops.complete_box_iou) + def test_iou_batch(self): + self._run_batch_test(ops.complete_box_iou) + def get_boxes(dtype, device): box1 = torch.tensor([-1, -1, 1, 1], dtype=dtype, device=device) @@ -1351,10 +1725,9 @@ def assert_empty_loss(iou_fn, dtype, device): class TestGeneralizedBoxIouLoss: # We refer to original test: https://github.com/facebookresearch/fvcore/blob/main/tests/test_giou_loss.py - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_giou_loss(self, dtype, device): - box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) # Identical boxes should have loss of 0 @@ -1375,7 +1748,12 @@ def test_giou_loss(self, dtype, device): assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, device=device, reduction="sum") assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, device=device, reduction="mean") - @pytest.mark.parametrize("device", cpu_and_gpu()) + # Test reduction value + # reduction value other than ["none", "mean", "sum"] should raise a ValueError + with pytest.raises(ValueError, match="Invalid"): + ops.generalized_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.generalized_box_iou_loss, dtype, device) @@ -1383,7 +1761,7 @@ def test_empty_inputs(self, dtype, device): class TestCompleteBoxIouLoss: @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_ciou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1394,14 +1772,17 @@ def test_ciou_loss(self, dtype, device): 
assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.complete_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.complete_box_iou_loss, dtype, device) class TestDistanceBoxIouLoss: - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_distance_iou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1413,7 +1794,10 @@ def test_distance_iou_loss(self, dtype, device): assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.distance_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_distance_iou_inputs(self, dtype, device): assert_empty_loss(ops.distance_box_iou_loss, dtype, device) @@ -1458,7 +1842,7 @@ def generate_tensor_with_range_type(shape, range_type, **kwargs): @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [0, 1]) def test_correct_ratio(self, alpha, gamma, device, dtype, seed): @@ -1487,7 +1871,7 @@ def test_correct_ratio(self, alpha, gamma, device, dtype, seed): torch.testing.assert_close(correct_ratio, loss_ratio, atol=tol, rtol=tol) @pytest.mark.parametrize("reduction", ["mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [2, 3]) def test_equal_ce_loss(self, reduction, device, dtype, seed): @@ -1514,7 +1898,7 @@ def test_equal_ce_loss(self, reduction, device, dtype, seed): @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) @pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [4, 5]) def test_jit(self, alpha, gamma, reduction, device, dtype, seed): @@ -1524,17 +1908,22 @@ def test_jit(self, alpha, gamma, reduction, device, dtype, seed): torch.random.manual_seed(seed) inputs, targets = self._generate_diverse_input_target_pair(dtype=dtype, device=device) focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - if device == "cpu": - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - else: - with torch.jit.fuser("fuser2"): - # Use fuser2 to prevent a bug on fuser: 
https://github.com/pytorch/pytorch/issues/75476 - # We may remove this condition once the bug is resolved - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) + scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) tol = 1e-3 if dtype is torch.half else 1e-5 torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol) + # Raise ValueError for anonymous reduction mode + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) + def test_reduction_mode(self, device, dtype, reduction="xyz"): + if device == "cpu" and dtype is torch.half: + pytest.skip("Currently torch.half is not fully supported on cpu") + torch.random.manual_seed(0) + inputs, targets = self._generate_diverse_input_target_pair(device=device, dtype=dtype) + with pytest.raises(ValueError, match="Invalid"): + ops.sigmoid_focal_loss(inputs, targets, 0.25, 2, reduction) + class TestMasksToBoxes: def test_masks_box(self): @@ -1604,7 +1993,7 @@ def test_stochastic_depth_random(self, seed, mode, p): counts += batch_size - non_zero_count num_samples += batch_size - p_value = stats.binom_test(counts, num_samples, p=p) + p_value = stats.binomtest(counts, num_samples, p=p).pvalue assert p_value > 0.01 @pytest.mark.parametrize("seed", range(10)) diff --git a/test/test_prototype_builtin_datasets.py b/test/test_prototype_datasets_builtin.py similarity index 67% rename from test/test_prototype_builtin_datasets.py rename to test/test_prototype_datasets_builtin.py index 6ddba1806c6..5f8fc90debf 100644 --- a/test/test_prototype_builtin_datasets.py +++ b/test/test_prototype_datasets_builtin.py @@ -1,28 +1,54 @@ -import functools import io import pickle +from collections import deque from pathlib import Path import pytest import torch +import torchvision.transforms.v2 as transforms + from builtin_dataset_mocks import DATASET_MOCKS, parametrize_dataset_mocks -from torch.testing._comparison import assert_equal, ObjectPair, TensorLikePair +from torch.testing._comparison import not_close_error_metas, ObjectPair, TensorLikePair + +# TODO: replace with torchdata.dataloader2.DataLoader2 as soon as it is stable-ish from torch.utils.data import DataLoader -from torch.utils.data.graph import traverse + +# TODO: replace with torchdata equivalent as soon as it is available from torch.utils.data.graph_settings import get_all_graph_pipes + +from torchdata.dataloader2.graph.utils import traverse_dps from torchdata.datapipes.iter import ShardingFilter, Shuffler +from torchdata.datapipes.utils import StreamWrapper +from torchvision import tv_tensors from torchvision._utils import sequence_to_str -from torchvision.prototype import datasets, transforms +from torchvision.prototype import datasets +from torchvision.prototype.datasets.utils import EncodedImage from torchvision.prototype.datasets.utils._internal import INFINITE_BUFFER_SIZE -from torchvision.prototype.features import Image, Label +from torchvision.prototype.tv_tensors import Label +from torchvision.transforms.v2._utils import is_pure_tensor + -assert_samples_equal = functools.partial( - assert_equal, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True -) +def assert_samples_equal(*args, msg=None, **kwargs): + error_metas = not_close_error_metas( + *args, pair_types=(TensorLikePair, ObjectPair), rtol=0, atol=0, equal_nan=True, **kwargs + ) + if error_metas: + raise error_metas[0].to_error(msg) def extract_datapipes(dp): - 
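The new reduction="xyz" cases above (for the generalized/complete/distance box IoU losses and sigmoid_focal_loss) only require that an unrecognized reduction raises a ValueError whose message contains "Invalid". A minimal sketch of that kind of guard is shown below, assuming the usual "none"/"mean"/"sum" modes; the exact wording and empty-input handling in torchvision may differ.

import torch


def _apply_reduction(loss, reduction):
    # Illustrative dispatch only; rejects anything outside the supported modes.
    if reduction == "none":
        return loss
    if reduction == "mean":
        # One way to keep empty inputs well-defined and differentiable
        # (cf. the test_empty_inputs cases above).
        return loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
    if reduction == "sum":
        return loss.sum()
    raise ValueError(f"Invalid reduction mode: '{reduction}'. Use 'none', 'mean' or 'sum'.")


# e.g. _apply_reduction(torch.rand(4), "xyz") raises ValueError with "Invalid" in the message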
return get_all_graph_pipes(traverse(dp, only_datapipe=True)) + return get_all_graph_pipes(traverse_dps(dp)) + + +def consume(iterator): + # Copied from the official itertools recipes: https://docs.python.org/3/library/itertools.html#itertools-recipes + deque(iterator, maxlen=0) + + +def next_consume(iterator): + item = next(iterator) + consume(iterator) + return item @pytest.fixture(autouse=True) @@ -66,7 +92,7 @@ def test_sample(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) try: - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) except StopIteration: raise AssertionError("Unable to draw any sample.") from None except Exception as error: @@ -84,29 +110,64 @@ def test_num_samples(self, dataset_mock, config): assert len(list(dataset)) == mock_info["num_samples"] + @pytest.fixture + def log_session_streams(self): + debug_unclosed_streams = StreamWrapper.debug_unclosed_streams + try: + StreamWrapper.debug_unclosed_streams = True + yield + finally: + StreamWrapper.debug_unclosed_streams = debug_unclosed_streams + + @parametrize_dataset_mocks(DATASET_MOCKS) + def test_stream_closing(self, log_session_streams, dataset_mock, config): + def make_msg_and_close(head): + unclosed_streams = [] + for stream in list(StreamWrapper.session_streams.keys()): + unclosed_streams.append(repr(stream.file_obj)) + stream.close() + unclosed_streams = "\n".join(unclosed_streams) + return f"{head}\n\n{unclosed_streams}" + + if StreamWrapper.session_streams: + raise pytest.UsageError(make_msg_and_close("A previous test did not close the following streams:")) + + dataset, _ = dataset_mock.load(config) + + consume(iter(dataset)) + + if StreamWrapper.session_streams: + raise AssertionError(make_msg_and_close("The following streams were not closed after a full iteration:")) + @parametrize_dataset_mocks(DATASET_MOCKS) - def test_no_vanilla_tensors(self, dataset_mock, config): + def test_no_unaccompanied_pure_tensors(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) + sample = next_consume(iter(dataset)) + + pure_tensors = {key for key, value in sample.items() if is_pure_tensor(value)} - vanilla_tensors = {key for key, value in next(iter(dataset)).items() if type(value) is torch.Tensor} - if vanilla_tensors: + if pure_tensors and not any( + isinstance(item, (tv_tensors.Image, tv_tensors.Video, EncodedImage)) for item in sample.values() + ): raise AssertionError( f"The values of key(s) " - f"{sequence_to_str(sorted(vanilla_tensors), separate_last='and ')} contained vanilla tensors." + f"{sequence_to_str(sorted(pure_tensors), separate_last='and ')} contained pure tensors, " + f"but didn't find any (encoded) image or video." 
) @parametrize_dataset_mocks(DATASET_MOCKS) def test_transformable(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - next(iter(dataset.map(transforms.Identity()))) + dataset = dataset.map(transforms.Identity()) + + consume(iter(dataset)) - @pytest.mark.parametrize("only_datapipe", [False, True]) @parametrize_dataset_mocks(DATASET_MOCKS) - def test_traversable(self, dataset_mock, config, only_datapipe): + def test_traversable(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - traverse(dataset, only_datapipe=only_datapipe) + traverse_dps(dataset) @parametrize_dataset_mocks(DATASET_MOCKS) def test_serializable(self, dataset_mock, config): @@ -132,7 +193,7 @@ def test_data_loader(self, dataset_mock, config, num_workers): collate_fn=self._collate_fn, ) - next(iter(dl)) + consume(dl) # TODO: we need to enforce not only that both a Shuffler and a ShardingFilter are part of the datapipe, but also # that the Shuffler comes before the ShardingFilter. Early commits in https://github.com/pytorch/vision/pull/5680 @@ -149,12 +210,12 @@ def test_has_annotations(self, dataset_mock, config, annotation_dp_type): def test_save_load(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) with io.BytesIO() as buffer: torch.save(sample, buffer) buffer.seek(0) - assert_samples_equal(torch.load(buffer), sample) + assert_samples_equal(torch.load(buffer, weights_only=True), sample) @parametrize_dataset_mocks(DATASET_MOCKS) def test_infinite_buffer_size(self, dataset_mock, config): @@ -178,7 +239,7 @@ class TestQMNIST: def test_extra_label(self, dataset_mock, config): dataset, _ = dataset_mock.load(config) - sample = next(iter(dataset)) + sample = next_consume(iter(dataset)) for key, type in ( ("nist_hsf_series", int), ("nist_writer_id", int), @@ -215,7 +276,7 @@ def test_sample_content(self, dataset_mock, config): assert "image" in sample assert "label" in sample - assert isinstance(sample["image"], Image) + assert isinstance(sample["image"], tv_tensors.Image) assert isinstance(sample["label"], Label) assert sample["image"].shape == (1, 16, 16) diff --git a/test/test_prototype_models.py b/test/test_prototype_models.py index 56f7b9cb6ac..d32df68f1f4 100644 --- a/test/test_prototype_models.py +++ b/test/test_prototype_models.py @@ -1,13 +1,13 @@ import pytest import test_models as TM import torch -from common_utils import cpu_and_gpu, set_rng_seed +from common_utils import cpu_and_cuda, set_rng_seed from torchvision.prototype import models -@pytest.mark.parametrize("model_fn", TM.list_model_fns(models.depth.stereo)) +@pytest.mark.parametrize("model_fn", (models.depth.stereo.raft_stereo_base,)) @pytest.mark.parametrize("model_mode", ("standard", "scripted")) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_raft_stereo(model_fn, model_mode, dev): # A simple test to make sure the model can do forward pass and jit scriptable set_rng_seed(0) @@ -36,3 +36,49 @@ def test_raft_stereo(model_fn, model_mode, dev): # Test against expected file output TM._assert_expected(depth_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize("model_fn", (models.depth.stereo.crestereo_base,)) +@pytest.mark.parametrize("model_mode", ("standard", "scripted")) +@pytest.mark.parametrize("dev", cpu_and_cuda()) +def test_crestereo(model_fn, model_mode, dev): + set_rng_seed(0) + + model = model_fn().eval().to(dev) + + if model_mode == "scripted": 
+ model = torch.jit.script(model) + + img1 = torch.rand(1, 3, 64, 64).to(dev) + img2 = torch.rand(1, 3, 64, 64).to(dev) + iterations = 3 + + preds = model(img1, img2, flow_init=None, num_iters=iterations) + disparity_pred = preds[-1] + + # all the pyramid levels except the highest res make only half the number of iterations + expected_iterations = (iterations // 2) * (len(model.resolutions) - 1) + expected_iterations += iterations + assert ( + len(preds) == expected_iterations + ), "Number of predictions should be the number of iterations multiplied by the number of pyramid levels" + + assert disparity_pred.shape == torch.Size( + [1, 2, 64, 64] + ), f"Predicted disparity should have the same spatial shape as the input. Inputs shape {img1.shape[2:]}, Prediction shape {disparity_pred.shape[2:]}" + + assert all( + d.shape == torch.Size([1, 2, 64, 64]) for d in preds + ), "All predicted disparities are expected to have the same shape" + + # test a backward pass with a dummy loss as well + preds = torch.stack(preds, dim=0) + targets = torch.ones_like(preds, requires_grad=False) + loss = torch.nn.functional.mse_loss(preds, targets) + + try: + loss.backward() + except Exception as e: + assert False, f"Backward pass failed with an unexpected exception: {e.__class__.__name__} {e}" + + TM._assert_expected(disparity_pred, name=model_fn.__name__, atol=1e-2, rtol=1e-2) diff --git a/test/test_prototype_transforms.py b/test/test_prototype_transforms.py index 0f485a10550..85ef98cf7b8 100644 --- a/test/test_prototype_transforms.py +++ b/test/test_prototype_transforms.py @@ -1,1316 +1,42 @@ -import itertools - -import numpy as np +import collections.abc +import re import PIL.Image - import pytest import torch -from common_utils import assert_equal, cpu_and_gpu -from test_prototype_transforms_functional import ( - make_bounding_box, - make_bounding_boxes, - make_image, - make_images, - make_label, - make_one_hot_labels, - make_segmentation_mask, -) -from torchvision.ops.boxes import box_iou -from torchvision.prototype import features, transforms -from torchvision.transforms.functional import InterpolationMode, pil_to_tensor, to_pil_image - - -def make_vanilla_tensor_images(*args, **kwargs): - for image in make_images(*args, **kwargs): - if image.ndim > 3: - continue - yield image.data - - -def make_pil_images(*args, **kwargs): - for image in make_vanilla_tensor_images(*args, **kwargs): - yield to_pil_image(image) - - -def make_vanilla_tensor_bounding_boxes(*args, **kwargs): - for bounding_box in make_bounding_boxes(*args, **kwargs): - yield bounding_box.data - - -def parametrize(transforms_with_inputs): - return pytest.mark.parametrize( - ("transform", "input"), - [ - pytest.param( - transform, - input, - id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", - ) - for transform, inputs in transforms_with_inputs - for idx, input in enumerate(inputs) - ], - ) - - -def parametrize_from_transforms(*transforms): - transforms_with_inputs = [] - for transform in transforms: - for creation_fn in [ - make_images, - make_bounding_boxes, - make_one_hot_labels, - make_vanilla_tensor_images, - make_pil_images, - ]: - inputs = list(creation_fn()) - try: - output = transform(inputs[0]) - except Exception: - continue - else: - if output is inputs[0]: - continue - - transforms_with_inputs.append((transform, inputs)) - - return parametrize(transforms_with_inputs) - - -class TestSmoke: - @parametrize_from_transforms( - transforms.RandomErasing(p=1.0), - transforms.Resize([16, 16]), - 
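For the expected-iterations assertion in test_crestereo further up: every pyramid level except the highest-resolution one runs only half the refinement iterations, so the number of returned predictions is the full iteration count plus (iterations // 2) for each remaining level. A quick worked check with hypothetical values (the real level count comes from len(model.resolutions) at runtime):

# Hypothetical values for illustration only.
iterations = 3
num_resolutions = 3  # assumed; the test reads this from model.resolutions
expected_iterations = (iterations // 2) * (num_resolutions - 1) + iterations
assert expected_iterations == 5  # 3 full-resolution steps + (3 // 2) = 1 from each of the 2 coarser levels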
transforms.CenterCrop([16, 16]), - transforms.ConvertImageDtype(), - transforms.RandomHorizontalFlip(), - transforms.Pad(5), - transforms.RandomZoomOut(), - transforms.RandomRotation(degrees=(-45, 45)), - transforms.RandomAffine(degrees=(-45, 45)), - transforms.RandomCrop([16, 16], padding=1, pad_if_needed=True), - # TODO: Something wrong with input data setup. Let's fix that - # transforms.RandomEqualize(), - # transforms.RandomInvert(), - # transforms.RandomPosterize(bits=4), - # transforms.RandomSolarize(threshold=0.5), - # transforms.RandomAdjustSharpness(sharpness_factor=0.5), - ) - def test_common(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transform, - [ - dict( - image=features.Image.new_like(image, image.unsqueeze(0), dtype=torch.float), - one_hot_label=features.OneHotLabel.new_like( - one_hot_label, one_hot_label.unsqueeze(0), dtype=torch.float - ), - ) - for image, one_hot_label in itertools.product(make_images(), make_one_hot_labels()) - ], - ) - for transform in [ - transforms.RandomMixup(alpha=1.0), - transforms.RandomCutmix(alpha=1.0), - ] - ] - ) - def test_mixup_cutmix(self, transform, input): - transform(input) - - # add other data that should bypass and wont raise any error - input_copy = dict(input) - input_copy["path"] = "/path/to/somewhere" - input_copy["num"] = 1234 - transform(input_copy) - - # Check if we raise an error if sample contains bbox or mask or label - err_msg = "does not support bounding boxes, segmentation masks and plain labels" - input_copy = dict(input) - for unsup_data in [make_label(), make_bounding_box(format="XYXY"), make_segmentation_mask()]: - input_copy["unsupported"] = unsup_data - with pytest.raises(TypeError, match=err_msg): - transform(input_copy) - - @parametrize( - [ - ( - transform, - itertools.chain.from_iterable( - fn( - color_spaces=[ - features.ColorSpace.GRAY, - features.ColorSpace.RGB, - ], - dtypes=[torch.uint8], - extra_dims=[(4,)], - ) - for fn in [ - make_images, - make_vanilla_tensor_images, - make_pil_images, - ] - ), - ) - for transform in ( - transforms.RandAugment(), - transforms.TrivialAugmentWide(), - transforms.AutoAugment(), - transforms.AugMix(), - ) - ] - ) - def test_auto_augment(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]), - itertools.chain.from_iterable( - fn(color_spaces=[features.ColorSpace.RGB], dtypes=[torch.float32]) - for fn in [ - make_images, - make_vanilla_tensor_images, - ] - ), - ), - ] - ) - def test_normalize(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.RandomResizedCrop([16, 16]), - itertools.chain( - make_images(extra_dims=[(4,)]), - make_vanilla_tensor_images(), - make_pil_images(), - ), - ) - ] - ) - def test_random_resized_crop(self, transform, input): - transform(input) - - @parametrize( - [ - ( - transforms.ConvertColorSpace(color_space=new_color_space, old_color_space=old_color_space), - itertools.chain.from_iterable( - [ - fn(color_spaces=[old_color_space]) - for fn in ( - make_images, - make_vanilla_tensor_images, - make_pil_images, - ) - ] - ), - ) - for old_color_space, new_color_space in itertools.product( - [ - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ], - repeat=2, - ) - ] - ) - def test_convertolor_space(self, transform, input): - transform(input) - - -@pytest.mark.parametrize("p", [0.0, 1.0]) -class TestRandomHorizontalFlip: - def 
input_expected_image_tensor(self, p, dtype=torch.float32): - input = torch.tensor([[[0, 1], [0, 1]], [[1, 0], [1, 0]]], dtype=dtype) - expected = torch.tensor([[[1, 0], [1, 0]], [[0, 1], [0, 1]]], dtype=dtype) - - return input, expected if p == 1 else input - - def test_simple_tensor(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(input) - - assert_equal(expected, actual) - - def test_pil_image(self, p): - input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(to_pil_image(input)) - - assert_equal(expected, pil_to_tensor(actual)) - - def test_features_image(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(features.Image(input)) - - assert_equal(features.Image(expected), actual) - - def test_features_segmentation_mask(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(features.SegmentationMask(input)) - - assert_equal(features.SegmentationMask(expected), actual) - - def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) - transform = transforms.RandomHorizontalFlip(p=p) - - actual = transform(input) - - expected_image_tensor = torch.tensor([5, 0, 10, 5]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) - assert_equal(expected, actual) - assert actual.format == expected.format - assert actual.image_size == expected.image_size - - -@pytest.mark.parametrize("p", [0.0, 1.0]) -class TestRandomVerticalFlip: - def input_expected_image_tensor(self, p, dtype=torch.float32): - input = torch.tensor([[[1, 1], [0, 0]], [[1, 1], [0, 0]]], dtype=dtype) - expected = torch.tensor([[[0, 0], [1, 1]], [[0, 0], [1, 1]]], dtype=dtype) - - return input, expected if p == 1 else input - - def test_simple_tensor(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(input) - - assert_equal(expected, actual) - - def test_pil_image(self, p): - input, expected = self.input_expected_image_tensor(p, dtype=torch.uint8) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(to_pil_image(input)) - - assert_equal(expected, pil_to_tensor(actual)) - - def test_features_image(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(features.Image(input)) - - assert_equal(features.Image(expected), actual) - - def test_features_segmentation_mask(self, p): - input, expected = self.input_expected_image_tensor(p) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(features.SegmentationMask(input)) - - assert_equal(features.SegmentationMask(expected), actual) - - def test_features_bounding_box(self, p): - input = features.BoundingBox([0, 0, 5, 5], format=features.BoundingBoxFormat.XYXY, image_size=(10, 10)) - transform = transforms.RandomVerticalFlip(p=p) - - actual = transform(input) - - expected_image_tensor = torch.tensor([0, 5, 5, 10]) if p == 1.0 else input - expected = features.BoundingBox.new_like(input, data=expected_image_tensor) - assert_equal(expected, actual) - assert actual.format == expected.format - assert actual.image_size 
== expected.image_size - - -class TestPad: - def test_assertions(self): - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.Pad("abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.Pad([-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.Pad(12, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.Pad(12, padding_mode="abc") - - @pytest.mark.parametrize("padding", [1, (1, 2), [1, 2, 3, 4]]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) - def test__transform(self, padding, fill, padding_mode, mocker): - transform = transforms.Pad(padding, fill=fill, padding_mode=padding_mode) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - inpt = mocker.MagicMock(spec=features.Image) - _ = transform(inpt) - - fn.assert_called_once_with(inpt, padding=padding, fill=fill, padding_mode=padding_mode) - -class TestRandomZoomOut: - def test_assertions(self): - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomZoomOut(fill="abc") +from common_utils import assert_equal, make_bounding_boxes, make_detection_masks, make_image, make_video - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomZoomOut(0, side_range=0) +from torchvision.prototype import transforms, tv_tensors +from torchvision.transforms.v2._utils import check_type, is_pure_tensor +from torchvision.transforms.v2.functional import clamp_bounding_boxes, InterpolationMode, pil_to_tensor, to_pil_image - with pytest.raises(ValueError, match="Invalid canvas side range"): - transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) +from torchvision.tv_tensors import BoundingBoxes, BoundingBoxFormat, Image, Mask, Video - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__get_params(self, fill, side_range, mocker): - transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) - image = mocker.MagicMock(spec=features.Image) - h, w = image.image_size = (24, 32) - - params = transform._get_params(image) - - assert params["fill"] == fill - assert len(params["padding"]) == 4 - assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h - assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w - assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h - - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) - def test__transform(self, fill, side_range, mocker): - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - transform = transforms.RandomZoomOut(fill=fill, side_range=side_range, p=1) - - fn = mocker.patch("torchvision.prototype.transforms.functional.pad") - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params) - - -class TestRandomRotation: - def test_assertions(self): - with pytest.raises(ValueError, match="is a single 
number, it must be positive"): - transforms.RandomRotation(-0.7) - - for d in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): - transforms.RandomRotation(d) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomRotation(12, fill="abc") - - with pytest.raises(TypeError, match="center should be a sequence of length"): - transforms.RandomRotation(12, center=12) - - with pytest.raises(ValueError, match="center should be a sequence of length"): - transforms.RandomRotation(12, center=[1, 2, 3]) - - def test__get_params(self): - angle_bound = 34 - transform = transforms.RandomRotation(angle_bound) - - params = transform._get_params(None) - assert -angle_bound <= params["angle"] <= angle_bound - - angle_bounds = [12, 34] - transform = transforms.RandomRotation(angle_bounds) - - params = transform._get_params(None) - assert angle_bounds[0] <= params["angle"] <= angle_bounds[1] - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("expand", [False, True]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) - def test__transform(self, degrees, expand, fill, center, mocker): - interpolation = InterpolationMode.BILINEAR - transform = transforms.RandomRotation( - degrees, interpolation=interpolation, expand=expand, fill=fill, center=center +def _parse_categories(categories): + if categories is None: + num_categories = int(torch.randint(1, 11, ())) + elif isinstance(categories, int): + num_categories = categories + categories = [f"category{idx}" for idx in range(num_categories)] + elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): + categories = list(categories) + num_categories = len(categories) + else: + raise pytest.UsageError( + f"`categories` can either be `None` (default), an integer, or a sequence of strings, " + f"but got '{categories}' instead." 
) + return categories, num_categories - if isinstance(degrees, (tuple, list)): - assert transform.degrees == [float(degrees[0]), float(degrees[1])] - else: - assert transform.degrees == [float(-degrees), float(degrees)] - - fn = mocker.patch("torchvision.prototype.transforms.functional.rotate") - inpt = mocker.MagicMock(spec=features.Image) - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params, interpolation=interpolation, expand=expand, fill=fill, center=center) - @pytest.mark.parametrize("angle", [34, -87]) - @pytest.mark.parametrize("expand", [False, True]) - def test_boundingbox_image_size(self, angle, expand): - # Specific test for BoundingBox.rotate - bbox = features.BoundingBox( - torch.tensor([1, 2, 3, 4]), format=features.BoundingBoxFormat.XYXY, image_size=(32, 32) - ) - img = features.Image(torch.rand(1, 3, 32, 32)) - - out_img = img.rotate(angle, expand=expand) - out_bbox = bbox.rotate(angle, expand=expand) - - assert out_img.image_size == out_bbox.image_size - - -class TestRandomAffine: - def test_assertions(self): - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomAffine(-0.7) - - for d in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="degrees should be a sequence of length 2"): - transforms.RandomAffine(d) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(12, fill="abc") - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomAffine(12, fill="abc") - - for kwargs in [ - {"center": 12}, - {"translate": 12}, - {"scale": 12}, - ]: - with pytest.raises(TypeError, match="should be a sequence of length"): - transforms.RandomAffine(12, **kwargs) - - for kwargs in [{"center": [1, 2, 3]}, {"translate": [1, 2, 3]}, {"scale": [1, 2, 3]}]: - with pytest.raises(ValueError, match="should be a sequence of length"): - transforms.RandomAffine(12, **kwargs) - - with pytest.raises(ValueError, match="translation values should be between 0 and 1"): - transforms.RandomAffine(12, translate=[-1.0, 2.0]) - - with pytest.raises(ValueError, match="scale values should be positive"): - transforms.RandomAffine(12, scale=[-1.0, 2.0]) - - with pytest.raises(ValueError, match="is a single number, it must be positive"): - transforms.RandomAffine(12, shear=-10) - - for s in [[-0.7], [-0.7, 0, 0.7]]: - with pytest.raises(ValueError, match="shear should be a sequence of length 2"): - transforms.RandomAffine(12, shear=s) - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) - @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) - @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) - def test__get_params(self, degrees, translate, scale, shear, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size - - transform = transforms.RandomAffine(degrees, translate=translate, scale=scale, shear=shear) - params = transform._get_params(image) - - if not isinstance(degrees, (list, tuple)): - assert -degrees <= params["angle"] <= degrees - else: - assert degrees[0] <= params["angle"] <= degrees[1] - - if translate is not None: - w_max = 
int(round(translate[0] * w)) - h_max = int(round(translate[1] * h)) - assert -w_max <= params["translations"][0] <= w_max - assert -h_max <= params["translations"][1] <= h_max - else: - assert params["translations"] == (0, 0) - - if scale is not None: - assert scale[0] <= params["scale"] <= scale[1] - else: - assert params["scale"] == 1.0 - - if shear is not None: - if isinstance(shear, float): - assert -shear <= params["shear"][0] <= shear - assert params["shear"][1] == 0.0 - elif len(shear) == 2: - assert shear[0] <= params["shear"][0] <= shear[1] - assert params["shear"][1] == 0.0 - else: - assert shear[0] <= params["shear"][0] <= shear[1] - assert shear[2] <= params["shear"][1] <= shear[3] - else: - assert params["shear"] == (0, 0) - - @pytest.mark.parametrize("degrees", [23, [0, 45], (0, 45)]) - @pytest.mark.parametrize("translate", [None, [0.1, 0.2]]) - @pytest.mark.parametrize("scale", [None, [0.7, 1.2]]) - @pytest.mark.parametrize("shear", [None, 2.0, [5.0, 15.0], [1.0, 2.0, 3.0, 4.0]]) - @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) - @pytest.mark.parametrize("center", [None, [2.0, 3.0]]) - def test__transform(self, degrees, translate, scale, shear, fill, center, mocker): - interpolation = InterpolationMode.BILINEAR - transform = transforms.RandomAffine( - degrees, - translate=translate, - scale=scale, - shear=shear, - interpolation=interpolation, - fill=fill, - center=center, - ) - - if isinstance(degrees, (tuple, list)): - assert transform.degrees == [float(degrees[0]), float(degrees[1])] - else: - assert transform.degrees == [float(-degrees), float(degrees)] - - fn = mocker.patch("torchvision.prototype.transforms.functional.affine") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params, interpolation=interpolation, fill=fill, center=center) - - -class TestRandomCrop: - def test_assertions(self): - with pytest.raises(ValueError, match="Please provide only two dimensions"): - transforms.RandomCrop([10, 12, 14]) - - with pytest.raises(TypeError, match="Got inappropriate padding arg"): - transforms.RandomCrop([10, 12], padding="abc") - - with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): - transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomCrop([10, 12], padding=1, fill="abc") - - with pytest.raises(ValueError, match="Padding mode should be either"): - transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") - - @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) - @pytest.mark.parametrize("size, pad_if_needed", [((10, 10), False), ((50, 25), True)]) - def test__get_params(self, padding, pad_if_needed, size, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - h, w = image.image_size - - transform = transforms.RandomCrop(size, padding=padding, pad_if_needed=pad_if_needed) - params = transform._get_params(image) - - if padding is not None: - if isinstance(padding, int): - h += 2 * padding - w += 2 * padding - elif isinstance(padding, list) and len(padding) == 2: - w += 2 * padding[0] - h += 2 * padding[1] - 
elif isinstance(padding, list) and len(padding) == 4: - w += padding[0] + padding[2] - h += padding[1] + padding[3] - - expected_input_width = w - expected_input_height = h - - if pad_if_needed: - if w < size[1]: - w += 2 * (size[1] - w) - if h < size[0]: - h += 2 * (size[0] - h) - - assert 0 <= params["top"] <= h - size[0] + 1 - assert 0 <= params["left"] <= w - size[1] + 1 - assert params["height"] == size[0] - assert params["width"] == size[1] - assert params["input_width"] == expected_input_width - assert params["input_height"] == expected_input_height - - @pytest.mark.parametrize("padding", [None, 1, [2, 3], [1, 2, 3, 4]]) - @pytest.mark.parametrize("pad_if_needed", [False, True]) - @pytest.mark.parametrize("fill", [False, True]) - @pytest.mark.parametrize("padding_mode", ["constant", "edge"]) - def test__transform(self, padding, pad_if_needed, fill, padding_mode, mocker): - output_size = [10, 12] - transform = transforms.RandomCrop( - output_size, padding=padding, pad_if_needed=pad_if_needed, fill=fill, padding_mode=padding_mode - ) - - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (32, 32) - - expected = mocker.MagicMock(spec=features.Image) - expected.num_channels = 3 - if isinstance(padding, int): - expected.image_size = (inpt.image_size[0] + padding, inpt.image_size[1] + padding) - elif isinstance(padding, list): - expected.image_size = ( - inpt.image_size[0] + sum(padding[0::2]), - inpt.image_size[1] + sum(padding[1::2]), - ) - else: - expected.image_size = inpt.image_size - _ = mocker.patch("torchvision.prototype.transforms.functional.pad", return_value=expected) - fn_crop = mocker.patch("torchvision.prototype.transforms.functional.crop") - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - if padding is None and not pad_if_needed: - fn_crop.assert_called_once_with( - inpt, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] - ) - elif not pad_if_needed: - fn_crop.assert_called_once_with( - expected, top=params["top"], left=params["left"], height=output_size[0], width=output_size[1] - ) - elif padding is None: - # vfdev-5: I do not know how to mock and test this case - pass - else: - # vfdev-5: I do not know how to mock and test this case - pass - - -class TestGaussianBlur: - def test_assertions(self): - with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): - transforms.GaussianBlur([10, 12, 14]) - - with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): - transforms.GaussianBlur(4) - - with pytest.raises(TypeError, match="sigma should be a single float or a list/tuple with length 2"): - transforms.GaussianBlur(3, sigma=[1, 2, 3]) - - with pytest.raises(ValueError, match="If sigma is a single number, it must be positive"): - transforms.GaussianBlur(3, sigma=-1.0) - - with pytest.raises(ValueError, match="sigma values should be positive and of the form"): - transforms.GaussianBlur(3, sigma=[2.0, 1.0]) - - @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0]]) - def test__get_params(self, sigma): - transform = transforms.GaussianBlur(3, sigma=sigma) - params = transform._get_params(None) - - if isinstance(sigma, float): - assert params["sigma"][0] == params["sigma"][1] == 10 - else: - assert sigma[0] <= 
params["sigma"][0] <= sigma[1] - assert sigma[0] <= params["sigma"][1] <= sigma[1] - - @pytest.mark.parametrize("kernel_size", [3, [3, 5], (5, 3)]) - @pytest.mark.parametrize("sigma", [2.0, [2.0, 3.0]]) - def test__transform(self, kernel_size, sigma, mocker): - transform = transforms.GaussianBlur(kernel_size=kernel_size, sigma=sigma) - - if isinstance(kernel_size, (tuple, list)): - assert transform.kernel_size == kernel_size - else: - assert transform.kernel_size == (kernel_size, kernel_size) - - if isinstance(sigma, (tuple, list)): - assert transform.sigma == sigma - else: - assert transform.sigma == (sigma, sigma) - - fn = mocker.patch("torchvision.prototype.transforms.functional.gaussian_blur") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params) - - -class TestRandomColorOp: - @pytest.mark.parametrize("p", [0.0, 1.0]) - @pytest.mark.parametrize( - "transform_cls, func_op_name, kwargs", - [ - (transforms.RandomEqualize, "equalize", {}), - (transforms.RandomInvert, "invert", {}), - (transforms.RandomAutocontrast, "autocontrast", {}), - (transforms.RandomPosterize, "posterize", {"bits": 4}), - (transforms.RandomSolarize, "solarize", {"threshold": 0.5}), - (transforms.RandomAdjustSharpness, "adjust_sharpness", {"sharpness_factor": 0.5}), - ], - ) - def test__transform(self, p, transform_cls, func_op_name, kwargs, mocker): - transform = transform_cls(p=p, **kwargs) - - fn = mocker.patch(f"torchvision.prototype.transforms.functional.{func_op_name}") - inpt = mocker.MagicMock(spec=features.Image) - _ = transform(inpt) - if p > 0.0: - fn.assert_called_once_with(inpt, **kwargs) - else: - assert fn.call_count == 0 - - -class TestRandomPerspective: - def test_assertions(self): - with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"): - transforms.RandomPerspective(distortion_scale=-1.0) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.RandomPerspective(0.5, fill="abc") - - def test__get_params(self, mocker): - dscale = 0.5 - transform = transforms.RandomPerspective(dscale) - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - params = transform._get_params(image) - - h, w = image.image_size - assert len(params["startpoints"]) == 4 - for x, y in params["startpoints"]: - assert x in (0, w - 1) - assert y in (0, h - 1) - - assert len(params["endpoints"]) == 4 - for (x, y), name in zip(params["endpoints"], ["tl", "tr", "br", "bl"]): - if "t" in name: - assert 0 <= y <= int(dscale * h // 2), (x, y, name) - if "b" in name: - assert h - int(dscale * h // 2) - 1 <= y <= h, (x, y, name) - if "l" in name: - assert 0 <= x <= int(dscale * w // 2), (x, y, name) - if "r" in name: - assert w - int(dscale * w // 2) - 1 <= x <= w, (x, y, name) - - @pytest.mark.parametrize("distortion_scale", [0.1, 0.7]) - def test__transform(self, distortion_scale, mocker): - interpolation = InterpolationMode.BILINEAR - fill = 12 - transform = transforms.RandomPerspective(distortion_scale, fill=fill, interpolation=interpolation) - - fn = mocker.patch("torchvision.prototype.transforms.functional.perspective") - inpt = 
mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - # vfdev-5, Feature Request: let's store params as Transform attribute - # This could be also helpful for users - # Otherwise, we can mock transform._get_params - torch.manual_seed(12) - _ = transform(inpt) - torch.manual_seed(12) - torch.rand(1) # random apply changes random state - params = transform._get_params(inpt) - - fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) - - -class TestElasticTransform: - def test_assertions(self): - - with pytest.raises(TypeError, match="alpha should be float or a sequence of floats"): - transforms.ElasticTransform({}) - - with pytest.raises(ValueError, match="alpha is a sequence its length should be one of 2"): - transforms.ElasticTransform([1.0, 2.0, 3.0]) - - with pytest.raises(ValueError, match="alpha should be a sequence of floats"): - transforms.ElasticTransform([1, 2]) - - with pytest.raises(TypeError, match="sigma should be float or a sequence of floats"): - transforms.ElasticTransform(1.0, {}) - - with pytest.raises(ValueError, match="sigma is a sequence its length should be one of 2"): - transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) - - with pytest.raises(ValueError, match="sigma should be a sequence of floats"): - transforms.ElasticTransform(1.0, [1, 2]) - - with pytest.raises(TypeError, match="Got inappropriate fill arg"): - transforms.ElasticTransform(1.0, 2.0, fill="abc") - - def test__get_params(self, mocker): - alpha = 2.0 - sigma = 3.0 - transform = transforms.ElasticTransform(alpha, sigma) - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - params = transform._get_params(image) - - h, w = image.image_size - displacement = params["displacement"] - assert displacement.shape == (1, h, w, 2) - assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() - assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() - - @pytest.mark.parametrize("alpha", [5.0, [5.0, 10.0]]) - @pytest.mark.parametrize("sigma", [2.0, [2.0, 5.0]]) - def test__transform(self, alpha, sigma, mocker): - interpolation = InterpolationMode.BILINEAR - fill = 12 - transform = transforms.ElasticTransform(alpha, sigma=sigma, fill=fill, interpolation=interpolation) - - if isinstance(alpha, float): - assert transform.alpha == [alpha, alpha] - else: - assert transform.alpha == alpha - - if isinstance(sigma, float): - assert transform.sigma == [sigma, sigma] - else: - assert transform.sigma == sigma - - fn = mocker.patch("torchvision.prototype.transforms.functional.elastic") - inpt = mocker.MagicMock(spec=features.Image) - inpt.num_channels = 3 - inpt.image_size = (24, 32) - - # Let's mock transform._get_params to control the output: - transform._get_params = mocker.MagicMock() - _ = transform(inpt) - params = transform._get_params(inpt) - fn.assert_called_once_with(inpt, **params, fill=fill, interpolation=interpolation) - - -class TestRandomErasing: - def test_assertions(self, mocker): - with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): - transforms.RandomErasing(value={}) - - with pytest.raises(ValueError, match="If value is str, it should be 'random'"): - transforms.RandomErasing(value="abc") - - with pytest.raises(TypeError, match="Scale should be a sequence"): - transforms.RandomErasing(scale=123) - - with pytest.raises(TypeError, match="Ratio should be a 
sequence"): - transforms.RandomErasing(ratio=123) - - with pytest.raises(ValueError, match="Scale should be between 0 and 1"): - transforms.RandomErasing(scale=[-1, 2]) - - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - transform = transforms.RandomErasing(value=[1, 2, 3, 4]) - - with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): - transform._get_params(image) - - @pytest.mark.parametrize("value", [5.0, [1, 2, 3], "random"]) - def test__get_params(self, value, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - - transform = transforms.RandomErasing(value=value) - params = transform._get_params(image) - - v = params["v"] - h, w = params["h"], params["w"] - i, j = params["i"], params["j"] - assert isinstance(v, torch.Tensor) - if value == "random": - assert v.shape == (image.num_channels, h, w) - elif isinstance(value, (int, float)): - assert v.shape == (1, 1, 1) - elif isinstance(value, (list, tuple)): - assert v.shape == (image.num_channels, 1, 1) - - assert 0 <= i <= image.image_size[0] - h - assert 0 <= j <= image.image_size[1] - w - - @pytest.mark.parametrize("p", [0, 1]) - def test__transform(self, mocker, p): - transform = transforms.RandomErasing(p=p) - transform._transformed_types = (mocker.MagicMock,) - - i_sentinel = mocker.MagicMock() - j_sentinel = mocker.MagicMock() - h_sentinel = mocker.MagicMock() - w_sentinel = mocker.MagicMock() - v_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._augment.RandomErasing._get_params", - return_value=dict(i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._augment.F.erase") - output = transform(inpt_sentinel) - - if p: - mock.assert_called_once_with( - inpt_sentinel, i=i_sentinel, j=j_sentinel, h=h_sentinel, w=w_sentinel, v=v_sentinel - ) - else: - mock.assert_not_called() - assert output is inpt_sentinel - - -class TestTransform: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test_check_transformed_types(self, inpt_type, mocker): - # This test ensures that we correctly handle which types to transform and which to bypass - t = transforms.Transform() - inpt = mocker.MagicMock(spec=inpt_type) - - if inpt_type in (np.ndarray, str, int): - output = t(inpt) - assert output is inpt - else: - with pytest.raises(NotImplementedError): - t(inpt) - - -class TestToImageTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch( - "torchvision.prototype.transforms.functional.to_image_tensor", - return_value=torch.rand(1, 3, 8, 8), - ) - - inpt = mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImageTensor() - transform(inpt) - if inpt_type in (features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, copy=transform.copy) - - -class TestToImagePIL: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.prototype.transforms.functional.to_image_pil") - - inpt = 
mocker.MagicMock(spec=inpt_type) - transform = transforms.ToImagePIL() - transform(inpt) - if inpt_type in (features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToPILImage: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.functional.to_pil_image") - - inpt = mocker.MagicMock(spec=inpt_type) - with pytest.warns(UserWarning, match="deprecated and will be removed"): - transform = transforms.ToPILImage() - transform(inpt) - if inpt_type in (PIL.Image.Image, features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt, mode=transform.mode) - - -class TestToTensor: - @pytest.mark.parametrize( - "inpt_type", - [torch.Tensor, PIL.Image.Image, features.Image, np.ndarray, features.BoundingBox, str, int], - ) - def test__transform(self, inpt_type, mocker): - fn = mocker.patch("torchvision.transforms.functional.to_tensor") - - inpt = mocker.MagicMock(spec=inpt_type) - with pytest.warns(UserWarning, match="deprecated and will be removed"): - transform = transforms.ToTensor() - transform(inpt) - if inpt_type in (features.Image, torch.Tensor, features.BoundingBox, str, int): - assert fn.call_count == 0 - else: - fn.assert_called_once_with(inpt) - - -class TestCompose: - def test_assertions(self): - with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): - transforms.Compose(123) - - @pytest.mark.parametrize( - "trfms", - [ - [transforms.Pad(2), transforms.RandomCrop(28)], - [lambda x: 2.0 * x], - ], - ) - def test_ctor(self, trfms): - c = transforms.Compose(trfms) - inpt = torch.rand(1, 3, 32, 32) - output = c(inpt) - assert isinstance(output, torch.Tensor) - - -class TestRandomIoUCrop: - @pytest.mark.parametrize("device", cpu_and_gpu()) - @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) - def test__get_params(self, device, options, mocker): - image = mocker.MagicMock(spec=features.Image) - image.num_channels = 3 - image.image_size = (24, 32) - bboxes = features.BoundingBox( - torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), - format="XYXY", - image_size=image.image_size, - device=device, - ) - sample = [image, bboxes] - - transform = transforms.RandomIoUCrop(sampler_options=options) - - n_samples = 5 - for _ in range(n_samples): - - params = transform._get_params(sample) - - if options == [2.0]: - assert len(params) == 0 - return - - assert len(params["is_within_crop_area"]) > 0 - assert params["is_within_crop_area"].dtype == torch.bool - - orig_h = image.image_size[0] - orig_w = image.image_size[1] - assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) - assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) - - left, top = params["left"], params["top"] - new_h, new_w = params["height"], params["width"] - ious = box_iou( - bboxes, - torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), - ) - assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" - - def test__transform_empty_params(self, mocker): - transform = transforms.RandomIoUCrop(sampler_options=[2.0]) - image = features.Image(torch.rand(1, 3, 4, 4)) - bboxes = features.BoundingBox(torch.tensor([[1, 1, 
2, 2]]), format="XYXY", image_size=(4, 4)) - label = features.Label(torch.tensor([1])) - sample = [image, bboxes, label] - # Let's mock transform._get_params to control the output: - transform._get_params = mocker.MagicMock(return_value={}) - output = transform(sample) - torch.testing.assert_close(output, sample) - - def test_forward_assertion(self): - transform = transforms.RandomIoUCrop() - with pytest.raises( - TypeError, - match="requires input sample to contain Images or PIL Images, BoundingBoxes and Labels or OneHotLabels", - ): - transform(torch.tensor(0)) - - def test__transform(self, mocker): - transform = transforms.RandomIoUCrop() - - image = features.Image(torch.rand(3, 32, 24)) - bboxes = make_bounding_box(format="XYXY", image_size=(32, 24), extra_dims=(6,)) - label = features.Label(torch.randint(0, 10, size=(6,))) - ohe_label = features.OneHotLabel(torch.zeros(6, 10).scatter_(1, label.unsqueeze(1), 1)) - masks = make_segmentation_mask((32, 24)) - ohe_masks = features.SegmentationMask(torch.randint(0, 2, size=(6, 32, 24))) - sample = [image, bboxes, label, ohe_label, masks, ohe_masks] - - fn = mocker.patch("torchvision.prototype.transforms.functional.crop", side_effect=lambda x, **params: x) - is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) - - params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) - transform._get_params = mocker.MagicMock(return_value=params) - output = transform(sample) - - assert fn.call_count == 4 - - expected_calls = [ - mocker.call(image, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call(bboxes, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call(masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"]), - mocker.call( - ohe_masks, top=params["top"], left=params["left"], height=params["height"], width=params["width"] - ), - ] - - fn.assert_has_calls(expected_calls) - - expected_within_targets = sum(is_within_crop_area) - - # check number of bboxes vs number of labels: - output_bboxes = output[1] - assert isinstance(output_bboxes, features.BoundingBox) - assert len(output_bboxes) == expected_within_targets - - # check labels - output_label = output[2] - assert isinstance(output_label, features.Label) - assert len(output_label) == expected_within_targets - torch.testing.assert_close(output_label, label[is_within_crop_area]) - - output_ohe_label = output[3] - assert isinstance(output_ohe_label, features.OneHotLabel) - torch.testing.assert_close(output_ohe_label, ohe_label[is_within_crop_area]) - - output_masks = output[4] - assert isinstance(output_masks, features.SegmentationMask) - assert output_masks.shape[:-2] == masks.shape[:-2] - - output_ohe_masks = output[5] - assert isinstance(output_ohe_masks, features.SegmentationMask) - assert len(output_ohe_masks) == expected_within_targets - - -class TestScaleJitter: - def test__get_params(self, mocker): - image_size = (24, 32) - target_size = (16, 12) - scale_range = (0.5, 1.5) - - transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) - - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) - params = transform._get_params(sample) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - height, width = size - - assert int(target_size[0] * scale_range[0]) <= height <= int(target_size[0] * 
scale_range[1]) - assert int(target_size[1] * scale_range[0]) <= width <= int(target_size[1] * scale_range[1]) - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - - transform = transforms.ScaleJitter(target_size=(16, 12), interpolation=interpolation_sentinel) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.ScaleJitter._get_params", return_value=dict(size=size_sentinel) - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock.assert_called_once_with(inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel) - - -class TestRandomShortestSize: - def test__get_params(self, mocker): - image_size = (3, 10) - min_size = [5, 9] - max_size = 20 - - transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size) - - sample = mocker.MagicMock(spec=features.Image, num_channels=3, image_size=image_size) - params = transform._get_params(sample) - - assert "size" in params - size = params["size"] - - assert isinstance(size, tuple) and len(size) == 2 - - longer = max(size) - assert longer <= max_size - - shorter = min(size) - if longer == max_size: - assert shorter <= max_size - else: - assert shorter in min_size - - def test__transform(self, mocker): - interpolation_sentinel = mocker.MagicMock() - - transform = transforms.RandomShortestSize(min_size=[3, 5, 7], max_size=12, interpolation=interpolation_sentinel) - transform._transformed_types = (mocker.MagicMock,) - - size_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.RandomShortestSize._get_params", - return_value=dict(size=size_sentinel), - ) - - inpt_sentinel = mocker.MagicMock() - - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.resize") - transform(inpt_sentinel) - - mock.assert_called_once_with(inpt_sentinel, size=size_sentinel, interpolation=interpolation_sentinel) +def make_label(*, extra_dims=(), categories=10, dtype=torch.int64, device="cpu"): + categories, num_categories = _parse_categories(categories) + # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, + # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 + data = torch.testing.make_tensor(extra_dims, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) + return tv_tensors.Label(data, categories=categories) class TestSimpleCopyPaste: @@ -1324,21 +50,22 @@ def test__extract_image_targets_assertion(self, mocker): flat_sample = [ # images, batch size = 2 - self.create_fake_image(mocker, features.Image), + self.create_fake_image(mocker, Image), # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=tv_tensors.Label), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), # labels, bboxes, masks - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), ] - with pytest.raises(TypeError, match="requires input sample to contain equal-sized list of Images"): + with pytest.raises(TypeError, match="requires input sample to contain equal sized list of Images"): transform._extract_image_targets(flat_sample) - @pytest.mark.parametrize("image_type", [features.Image, PIL.Image.Image, torch.Tensor]) - def test__extract_image_targets(self, image_type, mocker): + @pytest.mark.parametrize("image_type", [Image, PIL.Image.Image, torch.Tensor]) + @pytest.mark.parametrize("label_type", [tv_tensors.Label, tv_tensors.OneHotLabel]) + def test__extract_image_targets(self, image_type, label_type, mocker): transform = transforms.SimpleCopyPaste() flat_sample = [ @@ -1346,13 +73,13 @@ def test__extract_image_targets(self, image_type, mocker): self.create_fake_image(mocker, image_type), self.create_fake_image(mocker, image_type), # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=label_type), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), # labels, bboxes, masks - mocker.MagicMock(spec=features.Label), - mocker.MagicMock(spec=features.BoundingBox), - mocker.MagicMock(spec=features.SegmentationMask), + mocker.MagicMock(spec=label_type), + mocker.MagicMock(spec=BoundingBoxes), + mocker.MagicMock(spec=Mask), ] images, targets = transform._extract_image_targets(flat_sample) @@ -1365,60 +92,85 @@ def test__extract_image_targets(self, image_type, mocker): assert images[0] == flat_sample[0] assert images[1] == flat_sample[1] - def test__copy_paste(self): + for target in targets: + for key, type_ in [ + ("boxes", BoundingBoxes), + ("masks", Mask), + ("labels", label_type), + ]: + assert key in target + assert isinstance(target[key], type_) + assert target[key] in flat_sample + + @pytest.mark.parametrize("label_type", [tv_tensors.Label, tv_tensors.OneHotLabel]) + def test__copy_paste(self, label_type): image = 2 * torch.ones(3, 32, 32) masks = torch.zeros(2, 32, 32) masks[0, 3:9, 2:8] = 1 masks[1, 20:30, 20:30] = 1 + labels = torch.tensor([1, 2]) + blending = True + resize_interpolation = InterpolationMode.BILINEAR + antialias = None + if label_type == tv_tensors.OneHotLabel: + labels = torch.nn.functional.one_hot(labels, num_classes=5) target = { - "boxes": features.BoundingBox( - torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", image_size=(32, 32) + "boxes": BoundingBoxes( + torch.tensor([[2.0, 3.0, 8.0, 9.0], [20.0, 20.0, 30.0, 30.0]]), format="XYXY", canvas_size=(32, 32) ), - 
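The `make_label` helper added earlier in this hunk samples label values as int64 and only casts to the requested dtype afterwards; a minimal sketch of that idiom, assuming nothing beyond `torch.testing.make_tensor` (the shape and bounds here are arbitrary illustrations, not values from the patch):

import torch

# Requesting a float dtype directly yields fractional values in [0, 10) ...
fractional = torch.testing.make_tensor((4,), dtype=torch.float32, device="cpu", low=0, high=10)
# ... whereas sampling as int64 and casting afterwards keeps whole numbers such as 0.0 or 7.0.
whole = torch.testing.make_tensor((4,), dtype=torch.int64, device="cpu", low=0, high=10).to(torch.float32)

Casting after integer sampling is what lets the same helper serve floating-point label dtypes without producing non-integral class ids.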
"masks": features.SegmentationMask(masks), - "labels": features.Label(torch.tensor([1, 2])), + "masks": Mask(masks), + "labels": label_type(labels), } paste_image = 10 * torch.ones(3, 32, 32) paste_masks = torch.zeros(2, 32, 32) paste_masks[0, 13:19, 12:18] = 1 paste_masks[1, 15:19, 1:8] = 1 + paste_labels = torch.tensor([3, 4]) + if label_type == tv_tensors.OneHotLabel: + paste_labels = torch.nn.functional.one_hot(paste_labels, num_classes=5) paste_target = { - "boxes": features.BoundingBox( - torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", image_size=(32, 32) + "boxes": BoundingBoxes( + torch.tensor([[12.0, 13.0, 19.0, 18.0], [1.0, 15.0, 8.0, 19.0]]), format="XYXY", canvas_size=(32, 32) ), - "masks": features.SegmentationMask(paste_masks), - "labels": features.Label(torch.tensor([3, 4])), + "masks": Mask(paste_masks), + "labels": label_type(paste_labels), } transform = transforms.SimpleCopyPaste() random_selection = torch.tensor([0, 1]) - output_image, output_target = transform._copy_paste(image, target, paste_image, paste_target, random_selection) + output_image, output_target = transform._copy_paste( + image, target, paste_image, paste_target, random_selection, blending, resize_interpolation, antialias + ) assert output_image.unique().tolist() == [2, 10] assert output_target["boxes"].shape == (4, 4) torch.testing.assert_close(output_target["boxes"][:2, :], target["boxes"]) torch.testing.assert_close(output_target["boxes"][2:, :], paste_target["boxes"]) - torch.testing.assert_close(output_target["labels"], features.Label(torch.tensor([1, 2, 3, 4]))) + + expected_labels = torch.tensor([1, 2, 3, 4]) + if label_type == tv_tensors.OneHotLabel: + expected_labels = torch.nn.functional.one_hot(expected_labels, num_classes=5) + torch.testing.assert_close(output_target["labels"], label_type(expected_labels)) + assert output_target["masks"].shape == (4, 32, 32) torch.testing.assert_close(output_target["masks"][:2, :], target["masks"]) torch.testing.assert_close(output_target["masks"][2:, :], paste_target["masks"]) class TestFixedSizeCrop: - def test__get_params(self, mocker): + def test_make_params(self, mocker): crop_size = (7, 7) batch_shape = (10,) - image_size = (11, 5) + canvas_size = (11, 5) transform = transforms.FixedSizeCrop(size=crop_size) - sample = dict( - image=make_image(size=image_size, color_space=features.ColorSpace.RGB), - bounding_boxes=make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=batch_shape - ), - ) - params = transform._get_params(sample) + flat_inputs = [ + make_image(size=canvas_size, color_space="RGB"), + make_bounding_boxes(format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_shape[0]), + ] + params = transform.make_params(flat_inputs) assert params["needs_crop"] assert params["height"] <= crop_size[0] @@ -1433,163 +185,245 @@ def test__get_params(self, mocker): assert params["needs_pad"] assert any(pad > 0 for pad in params["padding"]) - @pytest.mark.parametrize("needs", list(itertools.product((False, True), repeat=2))) - def test__transform(self, mocker, needs): - fill_sentinel = mocker.MagicMock() - padding_mode_sentinel = mocker.MagicMock() - - transform = transforms.FixedSizeCrop((-1, -1), fill=fill_sentinel, padding_mode=padding_mode_sentinel) - transform._transformed_types = (mocker.MagicMock,) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) - mocker.patch("torchvision.prototype.transforms._geometry.has_any", 
return_value=True) - - needs_crop, needs_pad = needs - top_sentinel = mocker.MagicMock() - left_sentinel = mocker.MagicMock() - height_sentinel = mocker.MagicMock() - width_sentinel = mocker.MagicMock() - padding_sentinel = mocker.MagicMock() - mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", - return_value=dict( - needs_crop=needs_crop, - top=top_sentinel, - left=left_sentinel, - height=height_sentinel, - width=width_sentinel, - padding=padding_sentinel, - needs_pad=needs_pad, - ), - ) - - inpt_sentinel = mocker.MagicMock() - - mock_crop = mocker.patch("torchvision.prototype.transforms._geometry.F.crop") - mock_pad = mocker.patch("torchvision.prototype.transforms._geometry.F.pad") - transform(inpt_sentinel) - - if needs_crop: - mock_crop.assert_called_once_with( - inpt_sentinel, - top=top_sentinel, - left=left_sentinel, - height=height_sentinel, - width=width_sentinel, - ) - else: - mock_crop.assert_not_called() - - if needs_pad: - # If we cropped before, the input to F.pad is no longer inpt_sentinel. Thus, we can't use - # `MagicMock.assert_called_once_with` and have to perform the checks manually - mock_pad.assert_called_once() - args, kwargs = mock_pad.call_args - if not needs_crop: - assert args[0] is inpt_sentinel - assert args[1] is padding_sentinel - assert kwargs == dict(fill=fill_sentinel, padding_mode=padding_mode_sentinel) - else: - mock_pad.assert_not_called() - def test__transform_culling(self, mocker): batch_size = 10 - image_size = (10, 10) + canvas_size = (10, 10) is_valid = torch.randint(0, 2, (batch_size,), dtype=torch.bool) mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", + "torchvision.prototype.transforms._geometry.FixedSizeCrop.make_params", return_value=dict( needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=canvas_size[0], + width=canvas_size[1], is_valid=is_valid, needs_pad=False, ), ) - bounding_boxes = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + bounding_boxes = make_bounding_boxes( + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_size ) - segmentation_masks = make_segmentation_mask(size=image_size, extra_dims=(batch_size,)) - labels = make_label(size=(batch_size,)) + masks = make_detection_masks(size=canvas_size, num_masks=batch_size) + labels = make_label(extra_dims=(batch_size,)) transform = transforms.FixedSizeCrop((-1, -1)) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) output = transform( dict( bounding_boxes=bounding_boxes, - segmentation_masks=segmentation_masks, + masks=masks, labels=labels, ) ) assert_equal(output["bounding_boxes"], bounding_boxes[is_valid]) - assert_equal(output["segmentation_masks"], segmentation_masks[is_valid]) + assert_equal(output["masks"], masks[is_valid]) assert_equal(output["labels"], labels[is_valid]) - def test__transform_bounding_box_clamping(self, mocker): + def test__transform_bounding_boxes_clamping(self, mocker): batch_size = 3 - image_size = (10, 10) + canvas_size = (10, 10) mocker.patch( - "torchvision.prototype.transforms._geometry.FixedSizeCrop._get_params", + "torchvision.prototype.transforms._geometry.FixedSizeCrop.make_params", return_value=dict( needs_crop=True, top=0, left=0, - height=image_size[0], - width=image_size[1], + height=canvas_size[0], + width=canvas_size[1], 
is_valid=torch.full((batch_size,), fill_value=True), needs_pad=False, ), ) - bounding_box = make_bounding_box( - format=features.BoundingBoxFormat.XYXY, image_size=image_size, extra_dims=(batch_size,) + bounding_boxes = make_bounding_boxes( + format=BoundingBoxFormat.XYXY, canvas_size=canvas_size, num_boxes=batch_size + ) + mock = mocker.patch( + "torchvision.prototype.transforms._geometry.F.clamp_bounding_boxes", wraps=clamp_bounding_boxes ) - mock = mocker.patch("torchvision.prototype.transforms._geometry.F.clamp_bounding_box") transform = transforms.FixedSizeCrop((-1, -1)) - mocker.patch("torchvision.prototype.transforms._geometry.has_all", return_value=True) mocker.patch("torchvision.prototype.transforms._geometry.has_any", return_value=True) - transform(bounding_box) + transform(bounding_boxes) mock.assert_called_once() -class TestLinearTransformation: - def test_assertions(self): - with pytest.raises(ValueError, match="transformation_matrix should be square"): - transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5)) +class TestLabelToOneHot: + def test__transform(self): + categories = ["apple", "pear", "pineapple"] + labels = tv_tensors.Label(torch.tensor([0, 1, 2, 1]), categories=categories) + transform = transforms.LabelToOneHot() + ohe_labels = transform(labels) + assert isinstance(ohe_labels, tv_tensors.OneHotLabel) + assert ohe_labels.shape == (4, 3) + assert ohe_labels.categories == labels.categories == categories - with pytest.raises(ValueError, match="mean_vector should have the same length"): - transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5)) +class TestPermuteDimensions: @pytest.mark.parametrize( - "inpt", + ("dims", "inverse_dims"), [ - 122 * torch.ones(1, 3, 8, 8), - 122.0 * torch.ones(1, 3, 8, 8), - features.Image(122 * torch.ones(1, 3, 8, 8)), - PIL.Image.new("RGB", (8, 8), (122, 122, 122)), + ( + {Image: (2, 1, 0), Video: None}, + {Image: (2, 1, 0), Video: None}, + ), + ( + {Image: (2, 1, 0), Video: (1, 2, 3, 0)}, + {Image: (2, 1, 0), Video: (3, 0, 1, 2)}, + ), ], ) - def test__transform(self, inpt): + def test_call(self, dims, inverse_dims): + sample = dict( + image=make_image(), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), + video=make_video(), + str="str", + int=0, + ) - v = 121 * torch.ones(3 * 8 * 8) - m = torch.ones(3 * 8 * 8, 3 * 8 * 8) - transform = transforms.LinearTransformation(m, v) + transform = transforms.PermuteDimensions(dims) + transformed_sample = transform(sample) - if isinstance(inpt, PIL.Image.Image): - with pytest.raises(TypeError, match="Unsupported input type"): - transform(inpt) - else: - output = transform(inpt) - assert isinstance(output, torch.Tensor) - assert output.unique() == 3 * 8 * 8 - assert output.dtype == inpt.dtype + for key, value in sample.items(): + value_type = type(value) + transformed_value = transformed_sample[key] + + if check_type(value, (Image, is_pure_tensor, Video)): + if transform.dims.get(value_type) is not None: + assert transformed_value.permute(inverse_dims[value_type]).equal(value) + assert type(transformed_value) == torch.Tensor + else: + assert transformed_value is value + + @pytest.mark.filterwarnings("error") + def test_plain_tensor_call(self): + tensor = torch.empty((2, 3, 4)) + transform = transforms.PermuteDimensions(dims=(1, 2, 0)) + + assert transform(tensor).shape == (3, 4, 2) + + @pytest.mark.parametrize("other_type", [Image, Video]) + def test_plain_tensor_warning(self, other_type): + with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will 
*not* be transformed")): + transforms.PermuteDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)}) + + +class TestTransposeDimensions: + @pytest.mark.parametrize( + "dims", + [ + (-1, -2), + {Image: (1, 2), Video: None}, + ], + ) + def test_call(self, dims): + sample = dict( + image=make_image(), + bounding_boxes=make_bounding_boxes(format=BoundingBoxFormat.XYXY), + video=make_video(), + str="str", + int=0, + ) + + transform = transforms.TransposeDimensions(dims) + transformed_sample = transform(sample) + + for key, value in sample.items(): + value_type = type(value) + transformed_value = transformed_sample[key] + + transposed_dims = transform.dims.get(value_type) + if check_type(value, (Image, is_pure_tensor, Video)): + if transposed_dims is not None: + assert transformed_value.transpose(*transposed_dims).equal(value) + assert type(transformed_value) == torch.Tensor + else: + assert transformed_value is value + + @pytest.mark.filterwarnings("error") + def test_plain_tensor_call(self): + tensor = torch.empty((2, 3, 4)) + transform = transforms.TransposeDimensions(dims=(0, 2)) + + assert transform(tensor).shape == (4, 3, 2) + + @pytest.mark.parametrize("other_type", [Image, Video]) + def test_plain_tensor_warning(self, other_type): + with pytest.warns(UserWarning, match=re.escape("`torch.Tensor` will *not* be transformed")): + transforms.TransposeDimensions(dims={torch.Tensor: (0, 1), other_type: (1, 0)}) + + +import importlib.machinery +import importlib.util +from pathlib import Path + + +def import_transforms_from_references(reference): + HERE = Path(__file__).parent + PROJECT_ROOT = HERE.parent + + loader = importlib.machinery.SourceFileLoader( + "transforms", str(PROJECT_ROOT / "references" / reference / "transforms.py") + ) + spec = importlib.util.spec_from_loader("transforms", loader) + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) + return module + + +det_transforms = import_transforms_from_references("detection") + + +def test_fixed_sized_crop_against_detection_reference(): + def make_tv_tensors(): + size = (600, 800) + num_objects = 22 + + pil_image = to_pil_image(make_image(size=size, color_space="RGB")) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), + } + + yield (pil_image, target) + + tensor_image = torch.Tensor(make_image(size=size, color_space="RGB")) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), + } + + yield (tensor_image, target) + + tv_tensor_image = make_image(size=size, color_space="RGB") + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", num_boxes=num_objects, dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + "masks": make_detection_masks(size=size, num_masks=num_objects, dtype=torch.long), + } + + yield (tv_tensor_image, target) + + t = transforms.FixedSizeCrop((1024, 1024), fill=0) + t_ref = det_transforms.FixedSizeCrop((1024, 1024), fill=0) + + for dp in make_tv_tensors(): + # We should use prototype transform first as reference transform performs inplace target update + torch.manual_seed(12) + output = t(dp) + + 
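The comment above notes that the reference detection transform updates its target in place, which is why the prototype transform is applied to `dp` first; a minimal sketch of how in-place mutation would otherwise skew the comparison (hypothetical stand-in callables in plain Python, not the torchvision API):

def reference_transform(image, target):
    # Mutates the caller's target dict, like the detection reference transforms.
    target["boxes"] = [b + 1 for b in target["boxes"]]
    return image, target

def prototype_transform(sample):
    image, target = sample
    # Returns a new target instead of mutating the input.
    return image, {**target, "boxes": [b * 2 for b in target["boxes"]]}

sample = ("image", {"boxes": [0, 0, 10, 10]})
output = prototype_transform(sample)      # sees the original boxes
expected = reference_transform(*sample)   # may now mutate `sample` without affecting `output`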
torch.manual_seed(12) + expected_output = t_ref(*dp) + + assert_equal(expected_output, output) diff --git a/test/test_prototype_transforms_functional.py b/test/test_prototype_transforms_functional.py deleted file mode 100644 index cd11eb2a35e..00000000000 --- a/test/test_prototype_transforms_functional.py +++ /dev/null @@ -1,1910 +0,0 @@ -import functools -import itertools -import math -import os - -import numpy as np -import PIL.Image -import pytest -import torch.testing -import torchvision.prototype.transforms.functional as F -from common_utils import cpu_and_gpu -from torch import jit -from torch.nn.functional import one_hot -from torchvision.prototype import features -from torchvision.prototype.transforms.functional._geometry import _center_crop_compute_padding -from torchvision.prototype.transforms.functional._meta import convert_bounding_box_format -from torchvision.transforms.functional import _get_perspective_coeffs -from torchvision.transforms.functional_tensor import _max_value as get_max_value - -make_tensor = functools.partial(torch.testing.make_tensor, device="cpu") - - -def make_image(size=None, *, color_space, extra_dims=(), dtype=torch.float32, constant_alpha=True): - size = size or torch.randint(16, 33, (2,)).tolist() - - try: - num_channels = { - features.ColorSpace.GRAY: 1, - features.ColorSpace.GRAY_ALPHA: 2, - features.ColorSpace.RGB: 3, - features.ColorSpace.RGB_ALPHA: 4, - }[color_space] - except KeyError as error: - raise pytest.UsageError() from error - - shape = (*extra_dims, num_channels, *size) - max_value = get_max_value(dtype) - data = make_tensor(shape, low=0, high=max_value, dtype=dtype) - if color_space in {features.ColorSpace.GRAY_ALPHA, features.ColorSpace.RGB_ALPHA} and constant_alpha: - data[..., -1, :, :] = max_value - return features.Image(data, color_space=color_space) - - -make_grayscale_image = functools.partial(make_image, color_space=features.ColorSpace.GRAY) -make_rgb_image = functools.partial(make_image, color_space=features.ColorSpace.RGB) - - -def make_images( - sizes=((16, 16), (7, 33), (31, 9)), - color_spaces=( - features.ColorSpace.GRAY, - features.ColorSpace.GRAY_ALPHA, - features.ColorSpace.RGB, - features.ColorSpace.RGB_ALPHA, - ), - dtypes=(torch.float32, torch.uint8), - extra_dims=((4,), (2, 3)), -): - for size, color_space, dtype in itertools.product(sizes, color_spaces, dtypes): - yield make_image(size, color_space=color_space, dtype=dtype) - - for color_space, dtype, extra_dims_ in itertools.product(color_spaces, dtypes, extra_dims): - yield make_image(size=sizes[0], color_space=color_space, extra_dims=extra_dims_, dtype=dtype) - - -def randint_with_tensor_bounds(arg1, arg2=None, **kwargs): - low, high = torch.broadcast_tensors( - *[torch.as_tensor(arg) for arg in ((0, arg1) if arg2 is None else (arg1, arg2))] - ) - return torch.stack( - [ - torch.randint(low_scalar, high_scalar, (), **kwargs) - for low_scalar, high_scalar in zip(low.flatten().tolist(), high.flatten().tolist()) - ] - ).reshape(low.shape) - - -def make_bounding_box(*, format, image_size=(32, 32), extra_dims=(), dtype=torch.int64): - if isinstance(format, str): - format = features.BoundingBoxFormat[format] - - height, width = image_size - - if format == features.BoundingBoxFormat.XYXY: - x1 = torch.randint(0, width // 2, extra_dims) - y1 = torch.randint(0, height // 2, extra_dims) - x2 = randint_with_tensor_bounds(x1 + 1, width - x1) + x1 - y2 = randint_with_tensor_bounds(y1 + 1, height - y1) + y1 - parts = (x1, y1, x2, y2) - elif format == 
features.BoundingBoxFormat.XYWH: - x = torch.randint(0, width // 2, extra_dims) - y = torch.randint(0, height // 2, extra_dims) - w = randint_with_tensor_bounds(1, width - x) - h = randint_with_tensor_bounds(1, height - y) - parts = (x, y, w, h) - elif format == features.BoundingBoxFormat.CXCYWH: - cx = torch.randint(1, width - 1, ()) - cy = torch.randint(1, height - 1, ()) - w = randint_with_tensor_bounds(1, torch.minimum(cx, width - cx) + 1) - h = randint_with_tensor_bounds(1, torch.minimum(cy, height - cy) + 1) - parts = (cx, cy, w, h) - else: - raise pytest.UsageError() - - return features.BoundingBox(torch.stack(parts, dim=-1).to(dtype), format=format, image_size=image_size) - - -make_xyxy_bounding_box = functools.partial(make_bounding_box, format=features.BoundingBoxFormat.XYXY) - - -def make_bounding_boxes( - formats=(features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH), - image_sizes=((32, 32),), - dtypes=(torch.int64, torch.float32), - extra_dims=((4,), (2, 3)), -): - for format, image_size, dtype in itertools.product(formats, image_sizes, dtypes): - yield make_bounding_box(format=format, image_size=image_size, dtype=dtype) - - for format, extra_dims_ in itertools.product(formats, extra_dims): - yield make_bounding_box(format=format, extra_dims=extra_dims_) - - -def make_label(size=(), *, categories=("category0", "category1")): - return features.Label(torch.randint(0, len(categories) if categories else 10, size), categories=categories) - - -def make_one_hot_label(*args, **kwargs): - label = make_label(*args, **kwargs) - return features.OneHotLabel(one_hot(label, num_classes=len(label.categories)), categories=label.categories) - - -def make_one_hot_labels( - *, - num_categories=(1, 2, 10), - extra_dims=((4,), (2, 3)), -): - for num_categories_ in num_categories: - yield make_one_hot_label(categories=[f"category{idx}" for idx in range(num_categories_)]) - - for extra_dims_ in extra_dims: - yield make_one_hot_label(extra_dims_) - - -def make_segmentation_mask(size=None, *, num_categories=80, extra_dims=(), dtype=torch.long): - size = size or torch.randint(16, 33, (2,)).tolist() - shape = (*extra_dims, 1, *size) - data = make_tensor(shape, low=0, high=num_categories, dtype=dtype) - return features.SegmentationMask(data) - - -def make_segmentation_masks( - sizes=((16, 16), (7, 33), (31, 9)), - dtypes=(torch.long,), - extra_dims=((), (4,), (2, 3)), -): - for size, dtype, extra_dims_ in itertools.product(sizes, dtypes, extra_dims): - yield make_segmentation_mask(size=size, dtype=dtype, extra_dims=extra_dims_) - - -class SampleInput: - def __init__(self, *args, **kwargs): - self.args = args - self.kwargs = kwargs - - -class FunctionalInfo: - def __init__(self, name, *, sample_inputs_fn): - self.name = name - self.functional = getattr(F, name) - self._sample_inputs_fn = sample_inputs_fn - - def sample_inputs(self): - yield from self._sample_inputs_fn() - - def __call__(self, *args, **kwargs): - if len(args) == 1 and not kwargs and isinstance(args[0], SampleInput): - sample_input = args[0] - return self.functional(*sample_input.args, **sample_input.kwargs) - - return self.functional(*args, **kwargs) - - -FUNCTIONAL_INFOS = [] - - -def register_kernel_info_from_sample_inputs_fn(sample_inputs_fn): - FUNCTIONAL_INFOS.append(FunctionalInfo(sample_inputs_fn.__name__, sample_inputs_fn=sample_inputs_fn)) - return sample_inputs_fn - - -@register_kernel_info_from_sample_inputs_fn -def horizontal_flip_image_tensor(): - for image in make_images(): - 
yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def horizontal_flip_bounding_box(): - for bounding_box in make_bounding_boxes(formats=[features.BoundingBoxFormat.XYXY]): - yield SampleInput(bounding_box, format=bounding_box.format, image_size=bounding_box.image_size) - - -@register_kernel_info_from_sample_inputs_fn -def horizontal_flip_segmentation_mask(): - for mask in make_segmentation_masks(): - yield SampleInput(mask) - - -@register_kernel_info_from_sample_inputs_fn -def vertical_flip_image_tensor(): - for image in make_images(): - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def vertical_flip_bounding_box(): - for bounding_box in make_bounding_boxes(formats=[features.BoundingBoxFormat.XYXY]): - yield SampleInput(bounding_box, format=bounding_box.format, image_size=bounding_box.image_size) - - -@register_kernel_info_from_sample_inputs_fn -def vertical_flip_segmentation_mask(): - for mask in make_segmentation_masks(): - yield SampleInput(mask) - - -@register_kernel_info_from_sample_inputs_fn -def resize_image_tensor(): - for image, interpolation, max_size, antialias in itertools.product( - make_images(), - [F.InterpolationMode.BILINEAR, F.InterpolationMode.NEAREST], # interpolation - [None, 34], # max_size - [False, True], # antialias - ): - - if antialias and interpolation == F.InterpolationMode.NEAREST: - continue - - height, width = image.shape[-2:] - for size in [ - (height, width), - (int(height * 0.75), int(width * 1.25)), - ]: - if max_size is not None: - size = [size[0]] - yield SampleInput(image, size=size, interpolation=interpolation, max_size=max_size, antialias=antialias) - - -@register_kernel_info_from_sample_inputs_fn -def resize_bounding_box(): - for bounding_box, max_size in itertools.product( - make_bounding_boxes(), - [None, 34], # max_size - ): - height, width = bounding_box.image_size - for size in [ - (height, width), - (int(height * 0.75), int(width * 1.25)), - ]: - if max_size is not None: - size = [size[0]] - yield SampleInput(bounding_box, size=size, image_size=bounding_box.image_size) - - -@register_kernel_info_from_sample_inputs_fn -def resize_segmentation_mask(): - for mask, max_size in itertools.product( - make_segmentation_masks(), - [None, 34], # max_size - ): - height, width = mask.shape[-2:] - for size in [ - (height, width), - (int(height * 0.75), int(width * 1.25)), - ]: - if max_size is not None: - size = [size[0]] - yield SampleInput(mask, size=size, max_size=max_size) - - -@register_kernel_info_from_sample_inputs_fn -def affine_image_tensor(): - for image, angle, translate, scale, shear in itertools.product( - make_images(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [5, -5], # translate - [0.77, 1.27], # scale - [0, 12], # shear - ): - yield SampleInput( - image, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - interpolation=F.InterpolationMode.NEAREST, - ) - - -@register_kernel_info_from_sample_inputs_fn -def affine_bounding_box(): - for bounding_box, angle, translate, scale, shear in itertools.product( - make_bounding_boxes(), - [-87, 15, 90], # angle - [5, -5], # translate - [0.77, 1.27], # scale - [0, 12], # shear - ): - yield SampleInput( - bounding_box, - format=bounding_box.format, - image_size=bounding_box.image_size, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - ) - - -@register_kernel_info_from_sample_inputs_fn -def affine_segmentation_mask(): - for mask, angle, translate, scale, shear in 
itertools.product( - make_segmentation_masks(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [5, -5], # translate - [0.77, 1.27], # scale - [0, 12], # shear - ): - yield SampleInput( - mask, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - ) - - -@register_kernel_info_from_sample_inputs_fn -def rotate_image_tensor(): - for image, angle, expand, center, fill in itertools.product( - make_images(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [True, False], # expand - [None, [12, 23]], # center - [None, [128], [12.0]], # fill - ): - if center is not None and expand: - # Skip warning: The provided center argument is ignored if expand is True - continue - - yield SampleInput(image, angle=angle, expand=expand, center=center, fill=fill) - - -@register_kernel_info_from_sample_inputs_fn -def rotate_bounding_box(): - for bounding_box, angle, expand, center in itertools.product( - make_bounding_boxes(), [-87, 15, 90], [True, False], [None, [12, 23]] - ): - if center is not None and expand: - # Skip warning: The provided center argument is ignored if expand is True - continue - - yield SampleInput( - bounding_box, - format=bounding_box.format, - image_size=bounding_box.image_size, - angle=angle, - expand=expand, - center=center, - ) - - -@register_kernel_info_from_sample_inputs_fn -def rotate_segmentation_mask(): - for mask, angle, expand, center in itertools.product( - make_segmentation_masks(extra_dims=((), (4,))), - [-87, 15, 90], # angle - [True, False], # expand - [None, [12, 23]], # center - ): - if center is not None and expand: - # Skip warning: The provided center argument is ignored if expand is True - continue - - yield SampleInput( - mask, - angle=angle, - expand=expand, - center=center, - ) - - -@register_kernel_info_from_sample_inputs_fn -def crop_image_tensor(): - for image, top, left, height, width in itertools.product(make_images(), [-8, 0, 9], [-8, 0, 9], [12, 20], [12, 20]): - yield SampleInput( - image, - top=top, - left=left, - height=height, - width=width, - ) - - -@register_kernel_info_from_sample_inputs_fn -def crop_bounding_box(): - for bounding_box, top, left in itertools.product(make_bounding_boxes(), [-8, 0, 9], [-8, 0, 9]): - yield SampleInput( - bounding_box, - format=bounding_box.format, - top=top, - left=left, - ) - - -@register_kernel_info_from_sample_inputs_fn -def crop_segmentation_mask(): - for mask, top, left, height, width in itertools.product( - make_segmentation_masks(), [-8, 0, 9], [-8, 0, 9], [12, 20], [12, 20] - ): - yield SampleInput( - mask, - top=top, - left=left, - height=height, - width=width, - ) - - -@register_kernel_info_from_sample_inputs_fn -def resized_crop_image_tensor(): - for mask, top, left, height, width, size, antialias in itertools.product( - make_images(), - [-8, 9], - [-8, 9], - [12], - [12], - [(16, 18)], - [True, False], - ): - yield SampleInput(mask, top=top, left=left, height=height, width=width, size=size, antialias=antialias) - - -@register_kernel_info_from_sample_inputs_fn -def resized_crop_bounding_box(): - for bounding_box, top, left, height, width, size in itertools.product( - make_bounding_boxes(), [-8, 9], [-8, 9], [32, 22], [34, 20], [(32, 32), (16, 18)] - ): - yield SampleInput( - bounding_box, format=bounding_box.format, top=top, left=left, height=height, width=width, size=size - ) - - -@register_kernel_info_from_sample_inputs_fn -def resized_crop_segmentation_mask(): - for mask, top, left, height, width, size in itertools.product( - make_segmentation_masks(), [-8, 0, 9], [-8, 
0, 9], [12, 20], [12, 20], [(32, 32), (16, 18)] - ): - yield SampleInput(mask, top=top, left=left, height=height, width=width, size=size) - - -@register_kernel_info_from_sample_inputs_fn -def pad_image_tensor(): - for image, padding, fill, padding_mode in itertools.product( - make_images(), - [[1], [1, 1], [1, 1, 2, 2]], # padding - [None, 12, 12.0], # fill - ["constant", "symmetric", "edge", "reflect"], # padding mode, - ): - yield SampleInput(image, padding=padding, fill=fill, padding_mode=padding_mode) - - -@register_kernel_info_from_sample_inputs_fn -def pad_segmentation_mask(): - for mask, padding, padding_mode in itertools.product( - make_segmentation_masks(), - [[1], [1, 1], [1, 1, 2, 2]], # padding - ["constant", "symmetric", "edge", "reflect"], # padding mode, - ): - yield SampleInput(mask, padding=padding, padding_mode=padding_mode) - - -@register_kernel_info_from_sample_inputs_fn -def pad_bounding_box(): - for bounding_box, padding in itertools.product( - make_bounding_boxes(), - [[1], [1, 1], [1, 1, 2, 2]], - ): - yield SampleInput(bounding_box, padding=padding, format=bounding_box.format) - - -@register_kernel_info_from_sample_inputs_fn -def perspective_image_tensor(): - for image, perspective_coeffs, fill in itertools.product( - make_images(extra_dims=((), (4,))), - [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ], - [None, [128], [12.0]], # fill - ): - yield SampleInput(image, perspective_coeffs=perspective_coeffs, fill=fill) - - -@register_kernel_info_from_sample_inputs_fn -def perspective_bounding_box(): - for bounding_box, perspective_coeffs in itertools.product( - make_bounding_boxes(), - [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ], - ): - yield SampleInput( - bounding_box, - format=bounding_box.format, - perspective_coeffs=perspective_coeffs, - ) - - -@register_kernel_info_from_sample_inputs_fn -def perspective_segmentation_mask(): - for mask, perspective_coeffs in itertools.product( - make_segmentation_masks(extra_dims=((), (4,))), - [ - [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], - [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], - ], - ): - yield SampleInput( - mask, - perspective_coeffs=perspective_coeffs, - ) - - -@register_kernel_info_from_sample_inputs_fn -def elastic_image_tensor(): - for image, fill in itertools.product( - make_images(extra_dims=((), (4,))), - [None, [128], [12.0]], # fill - ): - h, w = image.shape[-2:] - displacement = torch.rand(1, h, w, 2) - yield SampleInput(image, displacement=displacement, fill=fill) - - -@register_kernel_info_from_sample_inputs_fn -def elastic_bounding_box(): - for bounding_box in make_bounding_boxes(): - h, w = bounding_box.image_size - displacement = torch.rand(1, h, w, 2) - yield SampleInput( - bounding_box, - format=bounding_box.format, - displacement=displacement, - ) - - -@register_kernel_info_from_sample_inputs_fn -def elastic_segmentation_mask(): - for mask in make_segmentation_masks(extra_dims=((), (4,))): - h, w = mask.shape[-2:] - displacement = torch.rand(1, h, w, 2) - yield SampleInput( - mask, - displacement=displacement, - ) - - -@register_kernel_info_from_sample_inputs_fn -def center_crop_image_tensor(): - for mask, output_size in itertools.product( - make_images(sizes=((16, 16), (7, 33), (31, 9))), - [[4, 3], [42, 70], [4]], # crop sizes < image sizes, 
crop_sizes > image sizes, single crop size - ): - yield SampleInput(mask, output_size) - - -@register_kernel_info_from_sample_inputs_fn -def center_crop_bounding_box(): - for bounding_box, output_size in itertools.product(make_bounding_boxes(), [(24, 12), [16, 18], [46, 48], [12]]): - yield SampleInput( - bounding_box, format=bounding_box.format, output_size=output_size, image_size=bounding_box.image_size - ) - - -@register_kernel_info_from_sample_inputs_fn -def center_crop_segmentation_mask(): - for mask, output_size in itertools.product( - make_segmentation_masks(sizes=((16, 16), (7, 33), (31, 9))), - [[4, 3], [42, 70], [4]], # crop sizes < image sizes, crop_sizes > image sizes, single crop size - ): - yield SampleInput(mask, output_size) - - -@register_kernel_info_from_sample_inputs_fn -def gaussian_blur_image_tensor(): - for image, kernel_size, sigma in itertools.product( - make_images(extra_dims=((4,),)), - [[3, 3]], - [None, [3.0, 3.0]], - ): - yield SampleInput(image, kernel_size=kernel_size, sigma=sigma) - - -@register_kernel_info_from_sample_inputs_fn -def equalize_image_tensor(): - for image in make_images(extra_dims=(), color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): - if image.dtype != torch.uint8: - continue - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def invert_image_tensor(): - for image in make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def posterize_image_tensor(): - for image, bits in itertools.product( - make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), - [1, 4, 8], - ): - if image.dtype != torch.uint8: - continue - yield SampleInput(image, bits=bits) - - -@register_kernel_info_from_sample_inputs_fn -def solarize_image_tensor(): - for image, threshold in itertools.product( - make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), - [0.1, 0.5, 127.0], - ): - if image.is_floating_point() and threshold > 1.0: - continue - yield SampleInput(image, threshold=threshold) - - -@register_kernel_info_from_sample_inputs_fn -def autocontrast_image_tensor(): - for image in make_images(color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)): - yield SampleInput(image) - - -@register_kernel_info_from_sample_inputs_fn -def adjust_sharpness_image_tensor(): - for image, sharpness_factor in itertools.product( - make_images(extra_dims=((4,),), color_spaces=(features.ColorSpace.GRAY, features.ColorSpace.RGB)), - [0.1, 0.5], - ): - yield SampleInput(image, sharpness_factor=sharpness_factor) - - -@register_kernel_info_from_sample_inputs_fn -def erase_image_tensor(): - for image in make_images(): - c = image.shape[-3] - yield SampleInput(image, i=1, j=2, h=6, w=7, v=torch.rand(c, 6, 7)) - - -@pytest.mark.parametrize( - "kernel", - [ - pytest.param(kernel, id=name) - for name, kernel in F.__dict__.items() - if not name.startswith("_") - and callable(kernel) - and any(feature_type in name for feature_type in {"image", "segmentation_mask", "bounding_box", "label"}) - and "pil" not in name - and name - not in { - "to_image_tensor", - } - ], -) -def test_scriptable(kernel): - jit.script(kernel) - - -# Test below is intended to test mid-level op vs low-level ops it calls -# For example, resize -> resize_image_tensor, resize_bounding_boxes etc -# TODO: Rewrite this tests as sample args may include more or less params -# than needed by functions -@pytest.mark.parametrize( - "func", - 
[ - pytest.param(func, id=name) - for name, func in F.__dict__.items() - if not name.startswith("_") - and callable(func) - and all( - feature_type not in name for feature_type in {"image", "segmentation_mask", "bounding_box", "label", "pil"} - ) - and name - not in { - "to_image_tensor", - "InterpolationMode", - "decode_video_with_av", - "crop", - "perspective", - "elastic_transform", - "elastic", - } - # We skip 'crop' due to missing 'height' and 'width' - # We skip 'perspective' as it requires different input args than perspective_image_tensor etc - # Skip 'elastic', TODO: inspect why test is failing - ], -) -def test_functional_mid_level(func): - finfos = [finfo for finfo in FUNCTIONAL_INFOS if f"{func.__name__}_" in finfo.name] - for finfo in finfos: - for sample_input in finfo.sample_inputs(): - expected = finfo(sample_input) - kwargs = dict(sample_input.kwargs) - for key in ["format", "image_size"]: - if key in kwargs: - del kwargs[key] - output = func(*sample_input.args, **kwargs) - torch.testing.assert_close( - output, expected, msg=f"finfo={finfo.name}, output={output}, expected={expected}" - ) - break - - -@pytest.mark.parametrize( - ("functional_info", "sample_input"), - [ - pytest.param(functional_info, sample_input, id=f"{functional_info.name}-{idx}") - for functional_info in FUNCTIONAL_INFOS - for idx, sample_input in enumerate(functional_info.sample_inputs()) - ], -) -def test_eager_vs_scripted(functional_info, sample_input): - eager = functional_info(sample_input) - scripted = jit.script(functional_info.functional)(*sample_input.args, **sample_input.kwargs) - - torch.testing.assert_close(eager, scripted) - - -def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): - rot = math.radians(angle_) - cx, cy = center_ - tx, ty = translate_ - sx, sy = [math.radians(sh_) for sh_ in shear_] - - c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) - t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) - c_matrix_inv = np.linalg.inv(c_matrix) - rs_matrix = np.array( - [ - [scale_ * math.cos(rot), -scale_ * math.sin(rot), 0], - [scale_ * math.sin(rot), scale_ * math.cos(rot), 0], - [0, 0, 1], - ] - ) - shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) - shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) - rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) - true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) - return true_matrix - - -@pytest.mark.parametrize("angle", range(-90, 90, 56)) -@pytest.mark.parametrize("translate", range(-10, 10, 8)) -@pytest.mark.parametrize("scale", [0.77, 1.0, 1.27]) -@pytest.mark.parametrize("shear", range(-15, 15, 8)) -@pytest.mark.parametrize("center", [None, (12, 14)]) -def test_correctness_affine_bounding_box(angle, translate, scale, shear, center): - def _compute_expected_bbox(bbox, angle_, translate_, scale_, shear_, center_): - affine_matrix = _compute_affine_matrix(angle_, translate_, scale_, shear_, center_) - affine_matrix = affine_matrix[:2, :] - - bbox_xyxy = convert_bounding_box_format( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = [ - np.min(transformed_points[:, 0]), - 
np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, - dtype=torch.float32, - device=bbox.device, - ) - return convert_bounding_box_format( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - for bboxes in make_bounding_boxes( - image_sizes=[ - image_size, - ], - extra_dims=((4,),), - ): - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.affine_bounding_box( - bboxes, - bboxes_format, - image_size=bboxes_image_size, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append( - _compute_expected_bbox(bbox, angle, (translate, translate), scale, (shear, shear), center_) - ) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_affine_bounding_box_on_fixed_input(device): - # Check transformation against known expected output - image_size = (64, 64) - # xyxy format - in_boxes = [ - [20, 25, 35, 45], - [50, 5, 70, 22], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], - [1, 1, 5, 5], - ] - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device - ) - # Tested parameters - angle = 63 - scale = 0.89 - dx = 0.12 - dy = 0.23 - - # Expected bboxes computed using albumentations: - # from albumentations.augmentations.geometric.functional import bbox_shift_scale_rotate - # from albumentations.augmentations.geometric.functional import normalize_bbox, denormalize_bbox - # expected_bboxes = [] - # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *image_size) - # n_out_box = bbox_shift_scale_rotate(n_in_box, -angle, scale, dx, dy, *image_size) - # out_box = denormalize_bbox(n_out_box, *image_size) - # expected_bboxes.append(out_box) - expected_bboxes = [ - (24.522435977922218, 34.375689508290854, 46.443125279998114, 54.3516575015695), - (54.88288587110401, 50.08453280875634, 76.44484547743795, 72.81332520036864), - (27.709526487041554, 34.74952648704156, 51.650473512958435, 58.69047351295844), - (48.56528888843238, 9.611532109828834, 53.35347829361575, 14.39972151501221), - ] - - output_boxes = F.affine_bounding_box( - in_boxes, - in_boxes.format, - in_boxes.image_size, - angle, - (dx * image_size[1], dy * image_size[0]), - scale, - shear=(0, 0), - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("angle", [-54, 56]) -@pytest.mark.parametrize("translate", [-7, 8]) -@pytest.mark.parametrize("scale", [0.89, 1.12]) -@pytest.mark.parametrize("shear", [4]) -@pytest.mark.parametrize("center", [None, (12, 14)]) -def test_correctness_affine_segmentation_mask(angle, translate, scale, shear, center): - def _compute_expected_mask(mask, angle_, translate_, scale_, shear_, 
center_): - assert mask.ndim == 3 and mask.shape[0] == 1 - affine_matrix = _compute_affine_matrix(angle_, translate_, scale_, shear_, center_) - inv_affine_matrix = np.linalg.inv(affine_matrix) - inv_affine_matrix = inv_affine_matrix[:2, :] - - expected_mask = torch.zeros_like(mask.cpu()) - for out_y in range(expected_mask.shape[1]): - for out_x in range(expected_mask.shape[2]): - output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0]) - input_pt = np.floor(np.dot(inv_affine_matrix, output_pt)).astype(np.int32) - in_x, in_y = input_pt[:2] - if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]: - expected_mask[0, out_y, out_x] = mask[0, in_y, in_x] - return expected_mask.to(mask.device) - - for mask in make_segmentation_masks(extra_dims=((), (4,))): - output_mask = F.affine_segmentation_mask( - mask, - angle=angle, - translate=(translate, translate), - scale=scale, - shear=(shear, shear), - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in mask.shape[-2:][::-1]] - - if mask.ndim < 4: - masks = [mask] - else: - masks = [m for m in mask] - - expected_masks = [] - for mask in masks: - expected_mask = _compute_expected_mask(mask, angle, (translate, translate), scale, (shear, shear), center_) - expected_masks.append(expected_mask) - if len(expected_masks) > 1: - expected_masks = torch.stack(expected_masks) - else: - expected_masks = expected_masks[0] - torch.testing.assert_close(output_mask, expected_masks) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_affine_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees and scale - expected_mask = torch.rot90(mask, k=-1, dims=(-2, -1)) - expected_mask = torch.nn.functional.interpolate(expected_mask[None, :].float(), size=(64, 64), mode="nearest") - expected_mask = expected_mask[0, :, 16 : 64 - 16, 16 : 64 - 16].long() - - out_mask = F.affine_segmentation_mask(mask, 90, [0.0, 0.0], 64.0 / 32.0, [0.0, 0.0]) - - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("angle", range(-90, 90, 56)) -@pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))]) -def test_correctness_rotate_bounding_box(angle, expand, center): - def _compute_expected_bbox(bbox, angle_, expand_, center_): - affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) - affine_matrix = affine_matrix[:2, :] - - image_size = bbox.image_size - bbox_xyxy = convert_bounding_box_format( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - # image frame - [0.0, 0.0, 1.0], - [0.0, image_size[0], 1.0], - [image_size[1], image_size[0], 1.0], - [image_size[1], 0.0, 1.0], - ] - ) - transformed_points = np.matmul(points, affine_matrix.T) - out_bbox = [ - np.min(transformed_points[:4, 0]), - np.min(transformed_points[:4, 1]), - np.max(transformed_points[:4, 0]), - np.max(transformed_points[:4, 1]), - ] - if expand_: - tr_x = np.min(transformed_points[4:, 0]) - tr_y = 
np.min(transformed_points[4:, 1]) - out_bbox[0] -= tr_x - out_bbox[1] -= tr_y - out_bbox[2] -= tr_x - out_bbox[3] -= tr_y - - # image_size should be updated, but it is OK here to skip its computation - # as we do not compute it in F.rotate_bounding_box - - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=image_size, - dtype=torch.float32, - device=bbox.device, - ) - return convert_bounding_box_format( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - for bboxes in make_bounding_boxes( - image_sizes=[ - image_size, - ], - extra_dims=((4,),), - ): - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.rotate_bounding_box( - bboxes, - bboxes_format, - image_size=bboxes_image_size, - angle=angle, - expand=expand, - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in bboxes_image_size[::-1]] - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, -angle, expand, center_)) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("expand", [False]) # expand=True does not match D2 -def test_correctness_rotate_bounding_box_on_fixed_input(device, expand): - # Check transformation against known expected output - image_size = (64, 64) - # xyxy format - in_boxes = [ - [1, 1, 5, 5], - [1, image_size[0] - 6, 5, image_size[0] - 2], - [image_size[1] - 6, image_size[0] - 6, image_size[1] - 2, image_size[0] - 2], - [image_size[1] // 2 - 10, image_size[0] // 2 - 10, image_size[1] // 2 + 10, image_size[0] // 2 + 10], - ] - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, dtype=torch.float64, device=device - ) - # Tested parameters - angle = 45 - center = None if expand else [12, 23] - - # # Expected bboxes computed using Detectron2: - # from detectron2.data.transforms import RotationTransform, AugmentationList - # from detectron2.data.transforms import AugInput - # import cv2 - # inpt = AugInput(im1, boxes=np.array(in_boxes, dtype="float32")) - # augs = AugmentationList([RotationTransform(*size, angle, expand=expand, center=center, interp=cv2.INTER_NEAREST), ]) - # out = augs(inpt) - # print(inpt.boxes) - if expand: - expected_bboxes = [ - [1.65937957, 42.67157288, 7.31623382, 48.32842712], - [41.96446609, 82.9766594, 47.62132034, 88.63351365], - [82.26955262, 42.67157288, 87.92640687, 48.32842712], - [31.35786438, 31.35786438, 59.64213562, 59.64213562], - ] - else: - expected_bboxes = [ - [-11.33452378, 12.39339828, -5.67766953, 18.05025253], - [28.97056275, 52.69848481, 34.627417, 58.35533906], - [69.27564928, 12.39339828, 74.93250353, 18.05025253], - [18.36396103, 1.07968978, 46.64823228, 29.36396103], - ] - - output_boxes = F.rotate_bounding_box( - in_boxes, - in_boxes.format, - in_boxes.image_size, - angle, - expand=expand, - center=center, - ) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("angle", range(-90, 90, 37)) -@pytest.mark.parametrize("expand, center", [(True, None), (False, None), (False, (12, 14))]) 
-def test_correctness_rotate_segmentation_mask(angle, expand, center): - def _compute_expected_mask(mask, angle_, expand_, center_): - assert mask.ndim == 3 and mask.shape[0] == 1 - image_size = mask.shape[-2:] - affine_matrix = _compute_affine_matrix(angle_, [0.0, 0.0], 1.0, [0.0, 0.0], center_) - inv_affine_matrix = np.linalg.inv(affine_matrix) - - if expand_: - # Pillow implementation on how to perform expand: - # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054-L2069 - height, width = image_size - points = np.array( - [ - [0.0, 0.0, 1.0], - [0.0, 1.0 * height, 1.0], - [1.0 * width, 1.0 * height, 1.0], - [1.0 * width, 0.0, 1.0], - ] - ) - new_points = points @ inv_affine_matrix.T - min_vals = np.min(new_points, axis=0)[:2] - max_vals = np.max(new_points, axis=0)[:2] - cmax = np.ceil(np.trunc(max_vals * 1e4) * 1e-4) - cmin = np.floor(np.trunc((min_vals + 1e-8) * 1e4) * 1e-4) - new_width, new_height = (cmax - cmin).astype("int32").tolist() - tr = np.array([-(new_width - width) / 2.0, -(new_height - height) / 2.0, 1.0]) @ inv_affine_matrix.T - - inv_affine_matrix[:2, 2] = tr[:2] - image_size = [new_height, new_width] - - inv_affine_matrix = inv_affine_matrix[:2, :] - expected_mask = torch.zeros(1, *image_size, dtype=mask.dtype) - - for out_y in range(expected_mask.shape[1]): - for out_x in range(expected_mask.shape[2]): - output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0]) - input_pt = np.floor(np.dot(inv_affine_matrix, output_pt)).astype(np.int32) - in_x, in_y = input_pt[:2] - if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]: - expected_mask[0, out_y, out_x] = mask[0, in_y, in_x] - return expected_mask.to(mask.device) - - for mask in make_segmentation_masks(extra_dims=((), (4,))): - output_mask = F.rotate_segmentation_mask( - mask, - angle=angle, - expand=expand, - center=center, - ) - - center_ = center - if center_ is None: - center_ = [s * 0.5 for s in mask.shape[-2:][::-1]] - - if mask.ndim < 4: - masks = [mask] - else: - masks = [m for m in mask] - - expected_masks = [] - for mask in masks: - expected_mask = _compute_expected_mask(mask, -angle, expand, center_) - expected_masks.append(expected_mask) - if len(expected_masks) > 1: - expected_masks = torch.stack(expected_masks) - else: - expected_masks = expected_masks[0] - torch.testing.assert_close(output_mask, expected_masks) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_rotate_segmentation_mask_on_fixed_input(device): - # Check transformation against known expected output and CPU/CUDA devices - - # Create a fixed input segmentation mask with 2 square masks - # in top-left, bottom-left corners - mask = torch.zeros(1, 32, 32, dtype=torch.long, device=device) - mask[0, 2:10, 2:10] = 1 - mask[0, 32 - 9 : 32 - 3, 3:9] = 2 - - # Rotate 90 degrees - expected_mask = torch.rot90(mask, k=1, dims=(-2, -1)) - out_mask = F.rotate_segmentation_mask(mask, 90, expand=False) - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "format", - [features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH], -) -@pytest.mark.parametrize( - "top, left, height, width, expected_bboxes", - [ - [8, 12, 30, 40, [(-2.0, 7.0, 13.0, 27.0), (38.0, -3.0, 58.0, 14.0), (33.0, 38.0, 44.0, 54.0)]], - [-8, 12, 70, 40, [(-2.0, 23.0, 13.0, 43.0), (38.0, 13.0, 58.0, 30.0), (33.0, 54.0, 44.0, 70.0)]], - ], -) -def 
test_correctness_crop_bounding_box(device, format, top, left, height, width, expected_bboxes): - - # Expected bboxes computed using Albumentations: - # import numpy as np - # from albumentations.augmentations.crops.functional import crop_bbox_by_coords, normalize_bbox, denormalize_bbox - # expected_bboxes = [] - # for in_box in in_boxes: - # n_in_box = normalize_bbox(in_box, *size) - # n_out_box = crop_bbox_by_coords( - # n_in_box, (left, top, left + width, top + height), height, width, *size - # ) - # out_box = denormalize_bbox(n_out_box, height, width) - # expected_bboxes.append(out_box) - - size = (64, 76) - # xyxy format - in_boxes = [ - [10.0, 15.0, 25.0, 35.0], - [50.0, 5.0, 70.0, 22.0], - [45.0, 46.0, 56.0, 62.0], - ] - in_boxes = features.BoundingBox(in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=size, device=device) - if format != features.BoundingBoxFormat.XYXY: - in_boxes = convert_bounding_box_format(in_boxes, features.BoundingBoxFormat.XYXY, format) - - output_boxes = F.crop_bounding_box( - in_boxes, - format, - top, - left, - ) - - if format != features.BoundingBoxFormat.XYXY: - output_boxes = convert_bounding_box_format(output_boxes, format, features.BoundingBoxFormat.XYXY) - - torch.testing.assert_close(output_boxes.tolist(), expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "top, left, height, width", - [ - [4, 6, 30, 40], - [-8, 6, 70, 40], - [-8, -6, 70, 8], - ], -) -def test_correctness_crop_segmentation_mask(device, top, left, height, width): - def _compute_expected_mask(mask, top_, left_, height_, width_): - h, w = mask.shape[-2], mask.shape[-1] - if top_ >= 0 and left_ >= 0 and top_ + height_ < h and left_ + width_ < w: - expected = mask[..., top_ : top_ + height_, left_ : left_ + width_] - else: - # Create output mask - expected_shape = mask.shape[:-2] + (height_, width_) - expected = torch.zeros(expected_shape, device=mask.device, dtype=mask.dtype) - - out_y1 = abs(top_) if top_ < 0 else 0 - out_y2 = h - top_ if top_ + height_ >= h else height_ - out_x1 = abs(left_) if left_ < 0 else 0 - out_x2 = w - left_ if left_ + width_ >= w else width_ - - in_y1 = 0 if top_ < 0 else top_ - in_y2 = h if top_ + height_ >= h else top_ + height_ - in_x1 = 0 if left_ < 0 else left_ - in_x2 = w if left_ + width_ >= w else left_ + width_ - # Paste input mask into output - expected[..., out_y1:out_y2, out_x1:out_x2] = mask[..., in_y1:in_y2, in_x1:in_x2] - - return expected - - for mask in make_segmentation_masks(): - if mask.device != torch.device(device): - mask = mask.to(device) - output_mask = F.crop_segmentation_mask(mask, top, left, height, width) - expected_mask = _compute_expected_mask(mask, top, left, height, width) - torch.testing.assert_close(output_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_horizontal_flip_segmentation_mask_on_fixed_input(device): - mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - mask[:, :, 0] = 1 - - out_mask = F.horizontal_flip_segmentation_mask(mask) - - expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - expected_mask[:, :, -1] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): - mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) - mask[:, 0, :] = 1 - - out_mask = F.vertical_flip_segmentation_mask(mask) - - expected_mask = torch.zeros((3, 3, 3), 
dtype=torch.long, device=device) - expected_mask[:, -1, :] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "format", - [features.BoundingBoxFormat.XYXY, features.BoundingBoxFormat.XYWH, features.BoundingBoxFormat.CXCYWH], -) -@pytest.mark.parametrize( - "top, left, height, width, size", - [ - [0, 0, 30, 30, (60, 60)], - [-5, 5, 35, 45, (32, 34)], - ], -) -def test_correctness_resized_crop_bounding_box(device, format, top, left, height, width, size): - def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): - # bbox should be xyxy - bbox[0] = (bbox[0] - left_) * size_[1] / width_ - bbox[1] = (bbox[1] - top_) * size_[0] / height_ - bbox[2] = (bbox[2] - left_) * size_[1] / width_ - bbox[3] = (bbox[3] - top_) * size_[0] / height_ - return bbox - - image_size = (100, 100) - # xyxy format - in_boxes = [ - [10.0, 10.0, 20.0, 20.0], - [5.0, 10.0, 15.0, 20.0], - ] - expected_bboxes = [] - for in_box in in_boxes: - expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) - expected_bboxes = torch.tensor(expected_bboxes, device=device) - - in_boxes = features.BoundingBox( - in_boxes, format=features.BoundingBoxFormat.XYXY, image_size=image_size, device=device - ) - if format != features.BoundingBoxFormat.XYXY: - in_boxes = convert_bounding_box_format(in_boxes, features.BoundingBoxFormat.XYXY, format) - - output_boxes = F.resized_crop_bounding_box(in_boxes, format, top, left, height, width, size) - - if format != features.BoundingBoxFormat.XYXY: - output_boxes = convert_bounding_box_format(output_boxes, format, features.BoundingBoxFormat.XYXY) - - torch.testing.assert_close(output_boxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "top, left, height, width, size", - [ - [0, 0, 30, 30, (60, 60)], - [5, 5, 35, 45, (32, 34)], - ], -) -def test_correctness_resized_crop_segmentation_mask(device, top, left, height, width, size): - def _compute_expected_mask(mask, top_, left_, height_, width_, size_): - output = mask.clone() - output = output[:, top_ : top_ + height_, left_ : left_ + width_] - output = torch.nn.functional.interpolate(output[None, :].float(), size=size_, mode="nearest") - output = output[0, :].long() - return output - - in_mask = torch.zeros(1, 100, 100, dtype=torch.long, device=device) - in_mask[0, 10:20, 10:20] = 1 - in_mask[0, 5:15, 12:23] = 2 - - expected_mask = _compute_expected_mask(in_mask, top, left, height, width, size) - output_mask = F.resized_crop_segmentation_mask(in_mask, top, left, height, width, size) - torch.testing.assert_close(output_mask, expected_mask) - - -def _parse_padding(padding): - if isinstance(padding, int): - return [padding] * 4 - if isinstance(padding, list): - if len(padding) == 1: - return padding * 4 - if len(padding) == 2: - return padding * 2 # [left, up, right, down] - - return padding - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]]) -def test_correctness_pad_bounding_box(device, padding): - def _compute_expected_bbox(bbox, padding_): - pad_left, pad_up, _, _ = _parse_padding(padding_) - - bbox_format = bbox.format - bbox_dtype = bbox.dtype - bbox = convert_bounding_box_format(bbox, old_format=bbox_format, new_format=features.BoundingBoxFormat.XYXY) - - bbox[0::2] += pad_left - bbox[1::2] += pad_up - - bbox = convert_bounding_box_format( - bbox, 
old_format=features.BoundingBoxFormat.XYXY, new_format=bbox_format, copy=False - ) - if bbox.dtype != bbox_dtype: - # Temporary cast to original dtype - # e.g. float32 -> int - bbox = bbox.to(bbox_dtype) - return bbox - - for bboxes in make_bounding_boxes(): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_boxes = F.pad_bounding_box(bboxes, padding, format=bboxes_format) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, padding)) - - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_boxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_correctness_pad_segmentation_mask_on_fixed_input(device): - mask = torch.ones((1, 3, 3), dtype=torch.long, device=device) - - out_mask = F.pad_segmentation_mask(mask, padding=[1, 1, 1, 1]) - - expected_mask = torch.zeros((1, 5, 5), dtype=torch.long, device=device) - expected_mask[:, 1:-1, 1:-1] = 1 - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("padding", [[1, 2, 3, 4], [1], 1, [1, 2]]) -@pytest.mark.parametrize("padding_mode", ["constant", "edge", "reflect", "symmetric"]) -def test_correctness_pad_segmentation_mask(padding, padding_mode): - def _compute_expected_mask(mask, padding_, padding_mode_): - h, w = mask.shape[-2], mask.shape[-1] - pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) - - if any(pad <= 0 for pad in [pad_left, pad_up, pad_right, pad_down]): - raise pytest.UsageError( - "Expected output can be computed on positive pad values only, " - "but F.pad_* can also crop for negative values" - ) - - new_h = h + pad_up + pad_down - new_w = w + pad_left + pad_right - - new_shape = (*mask.shape[:-2], new_h, new_w) if len(mask.shape) > 2 else (new_h, new_w) - output = torch.zeros(new_shape, dtype=mask.dtype) - output[..., pad_up:-pad_down, pad_left:-pad_right] = mask - - if padding_mode_ == "edge": - # pad top-left corner, left vertical block, bottom-left corner - output[..., :pad_up, :pad_left] = mask[..., 0, 0].unsqueeze(-1).unsqueeze(-2) - output[..., pad_up:-pad_down, :pad_left] = mask[..., :, 0].unsqueeze(-1) - output[..., -pad_down:, :pad_left] = mask[..., -1, 0].unsqueeze(-1).unsqueeze(-2) - # pad top-right corner, right vertical block, bottom-right corner - output[..., :pad_up, -pad_right:] = mask[..., 0, -1].unsqueeze(-1).unsqueeze(-2) - output[..., pad_up:-pad_down, -pad_right:] = mask[..., :, -1].unsqueeze(-1) - output[..., -pad_down:, -pad_right:] = mask[..., -1, -1].unsqueeze(-1).unsqueeze(-2) - # pad top and bottom horizontal blocks - output[..., :pad_up, pad_left:-pad_right] = mask[..., 0, :].unsqueeze(-2) - output[..., -pad_down:, pad_left:-pad_right] = mask[..., -1, :].unsqueeze(-2) - elif padding_mode_ in ("reflect", "symmetric"): - d1 = 1 if padding_mode_ == "reflect" else 0 - d2 = -1 if padding_mode_ == "reflect" else None - both = (-1, -2) - # pad top-left corner, left vertical block, bottom-left corner - output[..., :pad_up, :pad_left] = mask[..., d1 : pad_up + d1, d1 : pad_left + d1].flip(both) - output[..., pad_up:-pad_down, :pad_left] = mask[..., :, d1 : pad_left + d1].flip(-1) - output[..., -pad_down:, :pad_left] = mask[..., -pad_down - d1 : d2, d1 : pad_left + d1].flip(both) - # pad 
top-right corner, right vertical block, bottom-right corner - output[..., :pad_up, -pad_right:] = mask[..., d1 : pad_up + d1, -pad_right - d1 : d2].flip(both) - output[..., pad_up:-pad_down, -pad_right:] = mask[..., :, -pad_right - d1 : d2].flip(-1) - output[..., -pad_down:, -pad_right:] = mask[..., -pad_down - d1 : d2, -pad_right - d1 : d2].flip(both) - # pad top and bottom horizontal blocks - output[..., :pad_up, pad_left:-pad_right] = mask[..., d1 : pad_up + d1, :].flip(-2) - output[..., -pad_down:, pad_left:-pad_right] = mask[..., -pad_down - d1 : d2, :].flip(-2) - - return output - - for mask in make_segmentation_masks(): - out_mask = F.pad_segmentation_mask(mask, padding, padding_mode=padding_mode) - - expected_mask = _compute_expected_mask(mask, padding, padding_mode) - torch.testing.assert_close(out_mask, expected_mask) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "startpoints, endpoints", - [ - [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], - ], -) -def test_correctness_perspective_bounding_box(device, startpoints, endpoints): - def _compute_expected_bbox(bbox, pcoeffs_): - m1 = np.array( - [ - [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], - [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], - ] - ) - m2 = np.array( - [ - [pcoeffs_[6], pcoeffs_[7], 1.0], - [pcoeffs_[6], pcoeffs_[7], 1.0], - ] - ) - - bbox_xyxy = convert_bounding_box_format( - bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY - ) - points = np.array( - [ - [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], - [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], - [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], - ] - ) - numer = np.matmul(points, m1.T) - denom = np.matmul(points, m2.T) - transformed_points = numer / denom - out_bbox = [ - np.min(transformed_points[:, 0]), - np.min(transformed_points[:, 1]), - np.max(transformed_points[:, 0]), - np.max(transformed_points[:, 1]), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYXY, - image_size=bbox.image_size, - dtype=torch.float32, - device=bbox.device, - ) - return convert_bounding_box_format( - out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False - ) - - image_size = (32, 38) - - pcoeffs = _get_perspective_coeffs(startpoints, endpoints) - inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) - - for bboxes in make_bounding_boxes( - image_sizes=[ - image_size, - ], - extra_dims=((4,),), - ): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_bboxes = F.perspective_bounding_box( - bboxes, - bboxes_format, - perspective_coeffs=pcoeffs, - ) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs)) - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=1e-5, atol=1e-5) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "startpoints, endpoints", - [ - [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 
2], [32, 3], [30, 24], [2, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], - [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], - ], -) -def test_correctness_perspective_segmentation_mask(device, startpoints, endpoints): - def _compute_expected_mask(mask, pcoeffs_): - assert mask.ndim == 3 and mask.shape[0] == 1 - m1 = np.array( - [ - [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], - [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], - ] - ) - m2 = np.array( - [ - [pcoeffs_[6], pcoeffs_[7], 1.0], - [pcoeffs_[6], pcoeffs_[7], 1.0], - ] - ) - - expected_mask = torch.zeros_like(mask.cpu()) - for out_y in range(expected_mask.shape[1]): - for out_x in range(expected_mask.shape[2]): - output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0]) - - numer = np.matmul(output_pt, m1.T) - denom = np.matmul(output_pt, m2.T) - input_pt = np.floor(numer / denom).astype(np.int32) - - in_x, in_y = input_pt[:2] - if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]: - expected_mask[0, out_y, out_x] = mask[0, in_y, in_x] - return expected_mask.to(mask.device) - - pcoeffs = _get_perspective_coeffs(startpoints, endpoints) - - for mask in make_segmentation_masks(extra_dims=((), (4,))): - mask = mask.to(device) - - output_mask = F.perspective_segmentation_mask( - mask, - perspective_coeffs=pcoeffs, - ) - - if mask.ndim < 4: - masks = [mask] - else: - masks = [m for m in mask] - - expected_masks = [] - for mask in masks: - expected_mask = _compute_expected_mask(mask, pcoeffs) - expected_masks.append(expected_mask) - if len(expected_masks) > 1: - expected_masks = torch.stack(expected_masks) - else: - expected_masks = expected_masks[0] - torch.testing.assert_close(output_mask, expected_masks) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "output_size", - [(18, 18), [18, 15], (16, 19), [12], [46, 48]], -) -def test_correctness_center_crop_bounding_box(device, output_size): - def _compute_expected_bbox(bbox, output_size_): - format_ = bbox.format - image_size_ = bbox.image_size - bbox = convert_bounding_box_format(bbox, format_, features.BoundingBoxFormat.XYWH) - - if len(output_size_) == 1: - output_size_.append(output_size_[-1]) - - cy = int(round((image_size_[0] - output_size_[0]) * 0.5)) - cx = int(round((image_size_[1] - output_size_[1]) * 0.5)) - out_bbox = [ - bbox[0].item() - cx, - bbox[1].item() - cy, - bbox[2].item(), - bbox[3].item(), - ] - out_bbox = features.BoundingBox( - out_bbox, - format=features.BoundingBoxFormat.XYWH, - image_size=output_size_, - dtype=bbox.dtype, - device=bbox.device, - ) - return convert_bounding_box_format(out_bbox, features.BoundingBoxFormat.XYWH, format_, copy=False) - - for bboxes in make_bounding_boxes( - image_sizes=[(32, 32), (24, 33), (32, 25)], - extra_dims=((4,),), - ): - bboxes = bboxes.to(device) - bboxes_format = bboxes.format - bboxes_image_size = bboxes.image_size - - output_boxes = F.center_crop_bounding_box(bboxes, bboxes_format, output_size, bboxes_image_size) - - if bboxes.ndim < 2: - bboxes = [bboxes] - - expected_bboxes = [] - for bbox in bboxes: - bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size) - expected_bboxes.append(_compute_expected_bbox(bbox, output_size)) - - if len(expected_bboxes) > 1: - expected_bboxes = torch.stack(expected_bboxes) - else: - expected_bboxes = expected_bboxes[0] - torch.testing.assert_close(output_boxes, expected_bboxes) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) 
-@pytest.mark.parametrize("output_size", [[4, 2], [4], [7, 6]]) -def test_correctness_center_crop_segmentation_mask(device, output_size): - def _compute_expected_segmentation_mask(mask, output_size): - crop_height, crop_width = output_size if len(output_size) > 1 else [output_size[0], output_size[0]] - - _, image_height, image_width = mask.shape - if crop_width > image_height or crop_height > image_width: - padding = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) - mask = F.pad_image_tensor(mask, padding, fill=0) - - left = round((image_width - crop_width) * 0.5) - top = round((image_height - crop_height) * 0.5) - - return mask[:, top : top + crop_height, left : left + crop_width] - - mask = torch.randint(0, 2, size=(1, 6, 6), dtype=torch.long, device=device) - actual = F.center_crop_segmentation_mask(mask, output_size) - - expected = _compute_expected_segmentation_mask(mask, output_size) - torch.testing.assert_close(expected, actual) - - -# Copied from test/test_functional_tensor.py -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("image_size", ("small", "large")) -@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) -@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) -def test_correctness_gaussian_blur_image_tensor(device, image_size, dt, ksize, sigma): - fn = F.gaussian_blur_image_tensor - - # true_cv2_results = { - # # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) - # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8) - # "3_3_0.8": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5) - # "3_3_0.5": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8) - # "3_5_0.8": ... - # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5) - # "3_5_0.5": ... - # # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) - # # cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7) - # "23_23_1.7": ... 
- # } - p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") - true_cv2_results = torch.load(p) - - if image_size == "small": - tensor = ( - torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) - ) - else: - tensor = torch.from_numpy(np.arange(26 * 28, dtype="uint8").reshape((1, 26, 28))).to(device) - - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return - - if dt is not None: - tensor = tensor.to(dtype=dt) - - _ksize = (ksize, ksize) if isinstance(ksize, int) else ksize - _sigma = sigma[0] if sigma is not None else None - shape = tensor.shape - gt_key = f"{shape[-2]}_{shape[-1]}_{shape[-3]}__{_ksize[0]}_{_ksize[1]}_{_sigma}" - if gt_key not in true_cv2_results: - return - - true_out = ( - torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) - ) - - image = features.Image(tensor) - - out = fn(image, kernel_size=ksize, sigma=sigma) - torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize( - "fn, make_samples", [(F.elastic_image_tensor, make_images), (F.elastic_segmentation_mask, make_segmentation_masks)] -) -def test_correctness_elastic_image_or_mask_tensor(device, fn, make_samples): - in_box = [10, 15, 25, 35] - for sample in make_samples(sizes=((64, 76),), extra_dims=((), (4,))): - c, h, w = sample.shape[-3:] - # Setup a dummy image with 4 points - sample[..., in_box[1], in_box[0]] = torch.tensor([12, 34, 96, 112])[:c] - sample[..., in_box[3] - 1, in_box[0]] = torch.tensor([12, 34, 96, 112])[:c] - sample[..., in_box[3] - 1, in_box[2] - 1] = torch.tensor([12, 34, 96, 112])[:c] - sample[..., in_box[1], in_box[2] - 1] = torch.tensor([12, 34, 96, 112])[:c] - sample = sample.to(device) - - if fn == F.elastic_image_tensor: - sample = features.Image(sample) - kwargs = {"interpolation": F.InterpolationMode.NEAREST} - else: - sample = features.SegmentationMask(sample) - kwargs = {} - - # Create a displacement grid using sin - n, m = 5.0, 0.1 - d1 = m * torch.sin(torch.arange(h, dtype=torch.float) * torch.pi * n / h) - d2 = m * torch.sin(torch.arange(w, dtype=torch.float) * torch.pi * n / w) - - d1 = d1[:, None].expand((h, w)) - d2 = d2[None, :].expand((h, w)) - - displacement = torch.cat([d1[..., None], d2[..., None]], dim=-1) - displacement = displacement.reshape(1, h, w, 2) - - output = fn(sample, displacement=displacement, **kwargs) - - # Check places where transformed points should be - torch.testing.assert_close(output[..., 12, 9], sample[..., in_box[1], in_box[0]]) - torch.testing.assert_close(output[..., 17, 27], sample[..., in_box[1], in_box[2] - 1]) - torch.testing.assert_close(output[..., 31, 6], sample[..., in_box[3] - 1, in_box[0]]) - torch.testing.assert_close(output[..., 37, 23], sample[..., in_box[3] - 1, in_box[2] - 1]) - - -def test_midlevel_normalize_output_type(): - inpt = torch.rand(1, 3, 32, 32) - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert isinstance(output, torch.Tensor) - torch.testing.assert_close(inpt - 0.5, output) - - inpt = make_segmentation_mask() - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert isinstance(output, features.SegmentationMask) - torch.testing.assert_close(inpt, output) - - inpt = make_bounding_box(format="XYXY") - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert 
isinstance(output, features.BoundingBox) - torch.testing.assert_close(inpt, output) - - inpt = make_image(color_space=features.ColorSpace.RGB) - output = F.normalize(inpt, mean=(0.5, 0.5, 0.5), std=(1.0, 1.0, 1.0)) - assert isinstance(output, torch.Tensor) - torch.testing.assert_close(inpt - 0.5, output) - - -@pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(3, 32, 32)), - 127 * np.ones((32, 32, 3), dtype="uint8"), - PIL.Image.new("RGB", (32, 32), 122), - ], -) -@pytest.mark.parametrize("copy", [True, False]) -def test_to_image_tensor(inpt, copy): - output = F.to_image_tensor(inpt, copy=copy) - assert isinstance(output, torch.Tensor) - - assert np.asarray(inpt).sum() == output.sum().item() - - if isinstance(inpt, PIL.Image.Image) and not copy: - # we can't check this option - # as PIL -> numpy is always copying - return - - if isinstance(inpt, PIL.Image.Image): - inpt.putpixel((0, 0), 11) - else: - inpt[0, 0, 0] = 11 - if copy: - assert output[0, 0, 0] != 11 - else: - assert output[0, 0, 0] == 11 - - -@pytest.mark.parametrize( - "inpt", - [ - torch.randint(0, 256, size=(3, 32, 32), dtype=torch.uint8), - 127 * np.ones((32, 32, 3), dtype="uint8"), - PIL.Image.new("RGB", (32, 32), 122), - ], -) -@pytest.mark.parametrize("mode", [None, "RGB"]) -def test_to_image_pil(inpt, mode): - output = F.to_image_pil(inpt, mode=mode) - assert isinstance(output, PIL.Image.Image) - - assert np.asarray(inpt).sum() == np.asarray(output).sum() diff --git a/test/test_prototype_transforms_utils.py b/test/test_prototype_transforms_utils.py deleted file mode 100644 index b83c4f3acb9..00000000000 --- a/test/test_prototype_transforms_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import PIL.Image -import pytest - -import torch - -from test_prototype_transforms_functional import make_bounding_box, make_image, make_segmentation_mask - -from torchvision.prototype import features -from torchvision.prototype.transforms._utils import has_all, has_any, is_simple_tensor -from torchvision.prototype.transforms.functional import to_image_pil - - -IMAGE = make_image(color_space=features.ColorSpace.RGB) -BOUNDING_BOX = make_bounding_box(format=features.BoundingBoxFormat.XYXY, image_size=IMAGE.image_size) -SEGMENTATION_MASK = make_segmentation_mask(size=IMAGE.image_size) - - -@pytest.mark.parametrize( - ("sample", "types", "expected"), - [ - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.SegmentationMask,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.SegmentationMask), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox, features.SegmentationMask), True), - ((SEGMENTATION_MASK,), (features.Image, features.BoundingBox), False), - ((BOUNDING_BOX,), (features.Image, features.SegmentationMask), False), - ((IMAGE,), (features.BoundingBox, features.SegmentationMask), False), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (features.Image, features.BoundingBox, features.SegmentationMask), - True, - ), - ((), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda obj: isinstance(obj, features.Image),), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: False,), False), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: True,), True), - 
((IMAGE,), (features.Image, PIL.Image.Image, is_simple_tensor), True), - ((torch.Tensor(IMAGE),), (features.Image, PIL.Image.Image, is_simple_tensor), True), - ((to_image_pil(IMAGE),), (features.Image, PIL.Image.Image, is_simple_tensor), True), - ], -) -def test_has_any(sample, types, expected): - assert has_any(sample, *types) is expected - - -@pytest.mark.parametrize( - ("sample", "types", "expected"), - [ - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.SegmentationMask,), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.SegmentationMask), True), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (features.BoundingBox, features.SegmentationMask), True), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (features.Image, features.BoundingBox, features.SegmentationMask), - True, - ), - ((BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox), False), - ((BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.SegmentationMask), False), - ((IMAGE, SEGMENTATION_MASK), (features.BoundingBox, features.SegmentationMask), False), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (features.Image, features.BoundingBox, features.SegmentationMask), - True, - ), - ((BOUNDING_BOX, SEGMENTATION_MASK), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ((IMAGE, SEGMENTATION_MASK), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ((IMAGE, BOUNDING_BOX), (features.Image, features.BoundingBox, features.SegmentationMask), False), - ( - (IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), - (lambda obj: isinstance(obj, (features.Image, features.BoundingBox, features.SegmentationMask)),), - True, - ), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: False,), False), - ((IMAGE, BOUNDING_BOX, SEGMENTATION_MASK), (lambda _: True,), True), - ], -) -def test_has_all(sample, types, expected): - assert has_all(sample, *types) is expected diff --git a/test/test_transforms.py b/test/test_transforms.py index 6ec670ffa78..d93800d59bc 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -2,17 +2,18 @@ import os import random import re +import sys from functools import partial import numpy as np import pytest import torch import torchvision.transforms as transforms -import torchvision.transforms._pil_constants as _pil_constants +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_tensor as F_t from PIL import Image from torch._utils_internal import get_file_path_2 +from torchvision.utils import _Image_fromarray try: import accimage @@ -174,7 +175,7 @@ def test_accimage_pil_to_tensor(self): def test_accimage_resize(self): trans = transforms.Compose( [ - transforms.Resize(256, interpolation=_pil_constants.LINEAR), + transforms.Resize(256, interpolation=Image.LINEAR), transforms.PILToTensor(), transforms.ConvertImageDtype(dtype=torch.float), ] @@ -319,7 +320,7 @@ def test_randomresized_params(): scale_range = (scale_min, scale_min + round(random.random(), 2)) aspect_min = max(round(random.random(), 2), epsilon) aspect_ratio_range = (aspect_min, aspect_min + round(random.random(), 2)) - randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range) + 
randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range, antialias=True) i, j, h, w = randresizecrop.get_params(img, scale_range, aspect_ratio_range) aspect_ratio_obtained = w / h assert ( @@ -366,7 +367,7 @@ def test_randomresized_params(): def test_resize(height, width, osize, max_size): img = Image.new("RGB", size=(width, height), color=127) - t = transforms.Resize(osize, max_size=max_size) + t = transforms.Resize(osize, max_size=max_size, antialias=True) result = t(img) msg = f"{height}, {width} - {osize} - {max_size}" @@ -424,7 +425,7 @@ def test_resize_sequence_output(height, width, osize): img = Image.new("RGB", size=(width, height), color=127) oheight, owidth = osize - t = transforms.Resize(osize) + t = transforms.Resize(osize, antialias=True) result = t(img) assert (owidth, oheight) == result.size @@ -447,11 +448,21 @@ def test_resize_size_equals_small_edge_size(height, width): img = Image.new("RGB", size=(width, height), color=127) small_edge = min(height, width) - t = transforms.Resize(small_edge, max_size=max_size) + t = transforms.Resize(small_edge, max_size=max_size, antialias=True) result = t(img) assert max(result.size) == max_size +def test_resize_equal_input_output_sizes(): + # Regression test for https://github.com/pytorch/vision/issues/7518 + height, width = 28, 27 + img = Image.new("RGB", size=(width, height)) + + t = transforms.Resize((height, width), antialias=True) + result = t(img) + assert result is img + + class TestPad: @pytest.mark.parametrize("fill", [85, 85.0]) def test_pad(self, fill): @@ -605,7 +616,7 @@ def _get_1_channel_tensor_various_types(): img_data_short = torch.ShortTensor(1, 4, 4).random_() expected_output = img_data_short.numpy() - yield img_data_short, expected_output, "I;16" + yield img_data_short, expected_output, "I;16" if sys.byteorder == "little" else "I;16B" img_data_int = torch.IntTensor(1, 4, 4).random_() expected_output = img_data_int.numpy() @@ -622,7 +633,7 @@ def _get_2d_tensor_various_types(): img_data_short = torch.ShortTensor(4, 4).random_() expected_output = img_data_short.numpy() - yield img_data_short, expected_output, "I;16" + yield img_data_short, expected_output, "I;16" if sys.byteorder == "little" else "I;16B" img_data_int = torch.IntTensor(4, 4).random_() expected_output = img_data_int.numpy() @@ -644,16 +655,16 @@ def test_1_channel_float_tensor_to_pil_image(self): img_F_mode = transforms.ToPILImage(mode="F")(img_data) assert img_F_mode.mode == "F" torch.testing.assert_close( - np.array(Image.fromarray(img_data.squeeze(0).numpy(), mode="F")), np.array(img_F_mode) + np.array(_Image_fromarray(img_data.squeeze(0).numpy(), mode="F")), np.array(img_F_mode) ) @pytest.mark.parametrize("with_mode", [False, True]) @pytest.mark.parametrize( "img_data, expected_mode", [ - (torch.Tensor(4, 4, 1).uniform_().numpy(), "F"), + (torch.Tensor(4, 4, 1).uniform_().numpy(), "L"), (torch.ByteTensor(4, 4, 1).random_(0, 255).numpy(), "L"), - (torch.ShortTensor(4, 4, 1).random_().numpy(), "I;16"), + (torch.ShortTensor(4, 4, 1).random_().numpy(), "I;16" if sys.byteorder == "little" else "I;16B"), (torch.IntTensor(4, 4, 1).random_().numpy(), "I"), ], ) @@ -661,6 +672,8 @@ def test_1_channel_ndarray_to_pil_image(self, with_mode, img_data, expected_mode transform = transforms.ToPILImage(mode=expected_mode) if with_mode else transforms.ToPILImage() img = transform(img_data) assert img.mode == expected_mode + if np.issubdtype(img_data.dtype, np.floating): + img_data = (img_data * 255).astype(np.uint8) # note: we explicitly 
convert img's dtype because pytorch doesn't support uint16 # and otherwise assert_close wouldn't be able to construct a tensor from the uint16 array torch.testing.assert_close(img_data[:, :, 0], np.asarray(img).astype(img_data.dtype)) @@ -731,9 +744,9 @@ def test_2d_tensor_to_pil_image(self, with_mode, img_data, expected_output, expe @pytest.mark.parametrize( "img_data, expected_mode", [ - (torch.Tensor(4, 4).uniform_().numpy(), "F"), + (torch.Tensor(4, 4).uniform_().numpy(), "L"), (torch.ByteTensor(4, 4).random_(0, 255).numpy(), "L"), - (torch.ShortTensor(4, 4).random_().numpy(), "I;16"), + (torch.ShortTensor(4, 4).random_().numpy(), "I;16" if sys.byteorder == "little" else "I;16B"), (torch.IntTensor(4, 4).random_().numpy(), "I"), ], ) @@ -741,6 +754,8 @@ def test_2d_ndarray_to_pil_image(self, with_mode, img_data, expected_mode): transform = transforms.ToPILImage(mode=expected_mode) if with_mode else transforms.ToPILImage() img = transform(img_data) assert img.mode == expected_mode + if np.issubdtype(img_data.dtype, np.floating): + img_data = (img_data * 255).astype(np.uint8) np.testing.assert_allclose(img_data, img) @pytest.mark.parametrize("expected_mode", [None, "RGB", "HSV", "YCbCr"]) @@ -864,8 +879,6 @@ def test_ndarray_bad_types_to_pil_image(self): trans(np.ones([4, 4, 1], np.uint16)) with pytest.raises(TypeError, match=reg_msg): trans(np.ones([4, 4, 1], np.uint32)) - with pytest.raises(TypeError, match=reg_msg): - trans(np.ones([4, 4, 1], np.float64)) with pytest.raises(ValueError, match=r"pic should be 2/3 dimensional. Got \d+ dimensions."): transforms.ToPILImage()(np.ones([1, 4, 4, 3])) @@ -883,7 +896,7 @@ def test_adjust_brightness(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_brightness(x_pil, 1) @@ -909,7 +922,7 @@ def test_adjust_contrast(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_contrast(x_pil, 1) @@ -931,38 +944,11 @@ def test_adjust_contrast(): torch.testing.assert_close(y_np, y_ans) -@pytest.mark.skipif(Image.__version__ >= "7", reason="Temporarily disabled") -def test_adjust_saturation(): - x_shape = [2, 2, 3] - x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] - x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") - - # test 0 - y_pil = F.adjust_saturation(x_pil, 1) - y_np = np.array(y_pil) - torch.testing.assert_close(y_np, x_np) - - # test 1 - y_pil = F.adjust_saturation(x_pil, 0.5) - y_np = np.array(y_pil) - y_ans = [2, 4, 8, 87, 128, 173, 39, 25, 138, 133, 215, 88] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - # test 2 - y_pil = F.adjust_saturation(x_pil, 2) - y_np = np.array(y_pil) - y_ans = [0, 6, 22, 0, 149, 255, 32, 0, 255, 4, 255, 0] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - def test_adjust_hue(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") with pytest.raises(ValueError): F.adjust_hue(x_pil, -0.7) @@ 
-1044,7 +1030,7 @@ def test_adjust_sharpness(): 117, ] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_sharpness(x_pil, 1) @@ -1167,7 +1153,7 @@ def test_adjust_sharpness(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") x_th = torch.tensor(x_np.transpose(2, 0, 1)) y_pil = F.adjust_sharpness(x_pil, 2) y_np = np.array(y_pil).transpose(2, 0, 1) @@ -1179,7 +1165,7 @@ def test_adjust_gamma(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") # test 0 y_pil = F.adjust_gamma(x_pil, 1) @@ -1205,7 +1191,7 @@ def test_adjusts_L_mode(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_rgb = Image.fromarray(x_np, mode="RGB") + x_rgb = _Image_fromarray(x_np, mode="RGB") x_l = x_rgb.convert("L") assert F.adjust_brightness(x_l, 2).mode == "L" @@ -1335,7 +1321,7 @@ def test_to_grayscale(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") x_pil_2 = x_pil.convert("L") gray_np = np.array(x_pil_2) @@ -1424,17 +1410,17 @@ def test_random_choice(proba_passthrough, seed): def test_random_order(): random_state = random.getstate() random.seed(42) - random_order_transform = transforms.RandomOrder([transforms.Resize(20), transforms.CenterCrop(10)]) + random_order_transform = transforms.RandomOrder([transforms.Resize(20, antialias=True), transforms.CenterCrop(10)]) img = transforms.ToPILImage()(torch.rand(3, 25, 25)) num_samples = 250 num_normal_order = 0 - resize_crop_out = transforms.CenterCrop(10)(transforms.Resize(20)(img)) + resize_crop_out = transforms.CenterCrop(10)(transforms.Resize(20, antialias=True)(img)) for _ in range(num_samples): out = random_order_transform(img) if out == resize_crop_out: num_normal_order += 1 - p_value = stats.binom_test(num_normal_order, num_samples, p=0.5) + p_value = stats.binomtest(num_normal_order, num_samples, p=0.5).pvalue random.setstate(random_state) assert p_value > 0.0001 @@ -1522,10 +1508,10 @@ def test_ten_crop(should_vflip, single_dim): five_crop.__repr__() if should_vflip: - vflipped_img = img.transpose(_pil_constants.FLIP_TOP_BOTTOM) + vflipped_img = img.transpose(Image.FLIP_TOP_BOTTOM) expected_output += five_crop(vflipped_img) else: - hflipped_img = img.transpose(_pil_constants.FLIP_LEFT_RIGHT) + hflipped_img = img.transpose(Image.FLIP_LEFT_RIGHT) expected_output += five_crop(hflipped_img) assert len(results) == 10 @@ -1629,8 +1615,8 @@ def test_augmix(fill, severity, mixture_width, chain_depth, all_ops, grayscale): def test_random_crop(): height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 img = torch.ones(3, height, width, dtype=torch.uint8) result = transforms.Compose( [ @@ -1670,7 +1656,7 @@ def test_random_crop(): 
assert result.size(1) == height + 1 assert result.size(2) == width + 1 - t = transforms.RandomCrop(48) + t = transforms.RandomCrop(33) img = torch.ones(3, 32, 32) with pytest.raises(ValueError, match=r"Required crop size .+ is larger than input image size .+"): t(img) @@ -1679,8 +1665,8 @@ def test_random_crop(): def test_center_crop(): height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 img = torch.ones(3, height, width, dtype=torch.uint8) oh1 = (height - oheight) // 2 @@ -1784,7 +1770,7 @@ def test_color_jitter(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") + x_pil = _Image_fromarray(x_np, mode="RGB") x_pil_2 = x_pil.convert("L") for _ in range(10): @@ -1798,6 +1784,12 @@ def test_color_jitter(): color_jitter.__repr__() +@pytest.mark.parametrize("hue", [1, (-1, 1)]) +def test_color_jitter_hue_out_of_bounds(hue): + with pytest.raises(ValueError, match=re.escape("hue values should be between (-0.5, 0.5)")): + transforms.ColorJitter(hue=hue) + + @pytest.mark.parametrize("seed", range(10)) @pytest.mark.skipif(stats is None, reason="scipy.stats not available") def test_random_erasing(seed): @@ -1818,7 +1810,7 @@ def test_random_erasing(seed): tol = 0.05 assert 1 / 3 - tol <= aspect_ratio <= 3 + tol - # Make sure that h > w and h < w are equaly likely (log-scale sampling) + # Make sure that h > w and h < w are equally likely (log-scale sampling) aspect_ratios = [] random.seed(42) trial = 1000 @@ -1834,7 +1826,7 @@ def test_random_erasing(seed): aspect_ratios.append(h / w) count_bigger_then_ones = len([1 for aspect_ratio in aspect_ratios if aspect_ratio > 1]) - p_value = stats.binom_test(count_bigger_then_ones, trial, p=0.5) + p_value = stats.binomtest(count_bigger_then_ones, trial, p=0.5).pvalue assert p_value > 0.0001 # Checking if RandomErasing can be printed as string @@ -1866,27 +1858,8 @@ def test_random_rotation(): # Checking if RandomRotation can be printed as string t.__repr__() - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed 0.14. " - "Please use 'interpolation' instead." - ), - ): - t = transforms.RandomRotation((-10, 10), resample=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR - - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - t = transforms.RandomRotation((-10, 10), interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomRotation((-10, 10), interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_random_rotation_error(): @@ -2078,7 +2051,7 @@ def _test_transformation(self, angle, translate, scale, shear, pil_image, input_ # https://github.com/python-pillow/Pillow/blob/71f8ec6a0cfc1008076a023c0756542539d057ab/ # src/libImaging/Geometry.c#L1060 input_pt = np.array([x + 0.5, y + 0.5, 1.0]) - res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(np.int) + res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(int) _x, _y = res[:2] if 0 <= _x < input_img.shape[1] and 0 <= _y < input_img.shape[0]: true_result[y, x, :] = input_img[_y, _x, :] @@ -2217,37 +2190,8 @@ def test_random_affine(): t = transforms.RandomAffine(10, interpolation=transforms.InterpolationMode.BILINEAR) assert "bilinear" in t.__repr__() - # assert deprecation warning and non-BC - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'resample' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'interpolation' instead." - ), - ): - t = transforms.RandomAffine(10, resample=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR - - with pytest.warns( - UserWarning, - match=re.escape( - "The parameter 'fillcolor' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'fill' instead." - ), - ): - t = transforms.RandomAffine(10, fillcolor=10) - assert t.fill == 10 - - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - t = transforms.RandomAffine(10, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomAffine(10, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_elastic_transformation(): @@ -2265,9 +2209,8 @@ def test_elastic_transformation(): with pytest.raises(ValueError, match=r"sigma is a sequence its length should be 2"): transforms.ElasticTransform(alpha=2.0, sigma=[1.0, 0.0, 1.0]) - with pytest.warns(UserWarning, match=r"Argument interpolation should be of type InterpolationMode"): - t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR with pytest.raises(TypeError, match=r"fill should be int or float"): transforms.ElasticTransform(alpha=1.0, sigma=1.0, fill={}) @@ -2287,5 +2230,17 @@ def test_elastic_transformation(): t.__repr__() +def test_random_grayscale_with_grayscale_input(): + transform = transforms.RandomGrayscale(p=1.0) + + image_tensor = torch.randint(0, 256, (1, 16, 16), dtype=torch.uint8) + output_tensor = transform(image_tensor) + torch.testing.assert_close(output_tensor, image_tensor) + + image_pil = F.to_pil_image(image_tensor) + output_pil = transform(image_pil) + torch.testing.assert_close(F.pil_to_tensor(output_pil), image_tensor) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index f4ca544deb8..eac52dafc17 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -2,16 +2,16 @@ import sys import numpy as np +import PIL.Image import pytest import torch -import torchvision.transforms._pil_constants as _pil_constants from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, _create_data, _create_data_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, float_dtypes, get_tmp_dir, int_dtypes, @@ -20,7 +20,12 @@ from torchvision.transforms import functional as F, InterpolationMode from torchvision.transforms.autoaugment import _apply_op -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) def _test_transform_vs_scripted(transform, s_transform, tensor, msg=None): @@ -94,12 +99,12 @@ def _test_op(func, method, device, channels=3, fn_kwargs=None, meth_kwargs=None, def _test_fn_save_load(fn, tmpdir): scripted_fn = torch.jit.script(fn) - p = os.path.join(tmpdir, f"t_op_list_{fn.__name__ if hasattr(fn, '__name__') else fn.__class__.__name__}.pt") + p = os.path.join(tmpdir, f"t_op_list_{getattr(fn, '__name__', fn.__class__.__name__)}.pt") scripted_fn.save(p) _ = torch.jit.load(p) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func,method,fn_kwargs,match_kwargs", [ @@ -124,7 +129,7 @@ def test_random(func, method, device, channels, fn_kwargs, match_kwargs): @pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) class TestColorJitter: 
@pytest.fixture(autouse=True) @@ -200,7 +205,7 @@ def test_color_jitter_all(self, device, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("m", ["constant", "edge", "reflect", "symmetric"]) @pytest.mark.parametrize("mul", [1, -1]) def test_pad(m, mul, device): @@ -223,7 +228,7 @@ def test_pad(m, mul, device): _test_op(F.pad, T.Pad, device=device, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_crop(device): fn_kwargs = {"top": 2, "left": 3, "height": 4, "width": 5} # Test transforms.RandomCrop with size and padding as tuple @@ -251,7 +256,7 @@ def test_crop(device): _test_functional_op(F.crop, fn_kwargs=fn_kwargs, device=device) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "padding_config", [ @@ -277,7 +282,7 @@ def test_random_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device, tmpdir): fn_kwargs = {"output_size": (4, 5)} meth_kwargs = {"size": (4, 5)} @@ -307,7 +312,7 @@ def test_center_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "fn, method, out_length", [ @@ -366,7 +371,7 @@ class TestResize: def test_resize_int(self, size): # TODO: Minimal check for bug-fix, improve this later x = torch.rand(3, 32, 46) - t = T.Resize(size=size) + t = T.Resize(size=size, antialias=True) y = t(x) # If size is an int, smaller edge of the image will be matched to this number. # i.e, if height > width, then image will be rescaled to (size * height / width, size). 
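Note: the smaller-edge rule described in the comment above can be sanity-checked outside the test suite. A minimal sketch, assuming the same 3x32x46 input used by `test_resize_int` and the `antialias=True` argument this diff adds to `T.Resize`:

import torch
import torchvision.transforms as T

# Height (32) is the smaller edge, so it is matched to `size`;
# the width scales by the same ratio: int(size * 46 / 32).
x = torch.rand(3, 32, 46)
size = 16
y = T.Resize(size=size, antialias=True)(x)
assert y.shape[1] == size                    # new height == 16
assert y.shape[2] == int(size * 46 / 32)     # new width  == 23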
@@ -374,11 +379,11 @@ def test_resize_int(self, size): assert y.shape[1] == size assert y.shape[2] == int(size * 46 / 32) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64]) @pytest.mark.parametrize("size", [[32], [32, 32], (32, 32), [34, 35]]) @pytest.mark.parametrize("max_size", [None, 35, 1000]) - @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) + @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize_scripted(self, dt, size, max_size, interpolation, device): tensor, _ = _create_data(height=34, width=36, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -389,25 +394,25 @@ def test_resize_scripted(self, dt, size, max_size, interpolation, device): if max_size is not None and len(size) != 1: pytest.skip("Size should be an int or a sequence of length 1 if max_size is specified") - transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size) + transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size, antialias=True) s_transform = torch.jit.script(transform) _test_transform_vs_scripted(transform, s_transform, tensor) _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resize_save_load(self, tmpdir): - fn = T.Resize(size=[32]) + fn = T.Resize(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) @pytest.mark.parametrize("ratio", [(0.75, 1.333), [0.75, 1.333]]) @pytest.mark.parametrize("size", [(32,), [44], [32], [32, 32], (32, 32), [44, 55]]) - @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) + @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC, NEAREST_EXACT]) @pytest.mark.parametrize("antialias", [None, True, False]) def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device): - if antialias and interpolation == NEAREST: - pytest.skip("Can not resize if interpolation mode is NEAREST and antialias=True") + if antialias and interpolation in {NEAREST, NEAREST_EXACT}: + pytest.skip(f"Can not resize if interpolation mode is {interpolation} and antialias=True") tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -419,7 +424,7 @@ def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resized_crop_save_load(self, tmpdir): - fn = T.RandomResizedCrop(size=[32]) + fn = T.RandomResizedCrop(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) @@ -438,42 +443,42 @@ def test_random_affine_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("shear", [15, 10.0, (5.0, 10.0), [-15, 15], [-10.0, 10.0, -11.0, 11.0]]) def test_random_affine_shear(device, interpolation, shear): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, shear=shear) -@pytest.mark.parametrize("device", cpu_and_gpu()) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) def test_random_affine_scale(device, interpolation, scale): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, scale=scale) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("translate", [(0.1, 0.2), [0.2, 0.1]]) def test_random_affine_translate(device, interpolation, translate): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, translate=translate) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) def test_random_affine_degrees(device, interpolation, degrees): _test_random_affine_helper(device, degrees=degrees, interpolation=interpolation) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_random_affine_fill(device, interpolation, fill): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, fill=fill) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("center", [(0, 0), [10, 10], None, (56, 44)]) @pytest.mark.parametrize("expand", [True, False]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) @@ -495,7 +500,7 @@ def test_random_rotate_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("distortion_scale", np.linspace(0.1, 1.0, num=20)) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -515,7 +520,7 @@ def test_random_perspective_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "Klass, meth_kwargs", [(T.Grayscale, {"num_output_channels": 1}), (T.Grayscale, {"num_output_channels": 3}), (T.RandomGrayscale, {})], @@ -525,7 +530,7 @@ def test_to_grayscale(device, Klass, meth_kwargs): _test_class_op(Klass, meth_kwargs=meth_kwargs, test_exact_match=False, device=device, tol=tol, agg_method="max") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("in_dtype", int_dtypes() + float_dtypes()) @pytest.mark.parametrize("out_dtype", int_dtypes() + float_dtypes()) def test_convert_image_dtype(device, in_dtype, out_dtype): @@ -556,7 +561,7 @@ def test_convert_image_dtype_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("policy", [policy for policy in T.AutoAugmentPolicy]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_autoaugment(device, policy, fill): @@ -570,7 +575,7 @@ def test_autoaugment(device, policy, fill): 
_test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_ops", [1, 2, 3]) @pytest.mark.parametrize("magnitude", [7, 9, 11]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -585,7 +590,7 @@ def test_randaugment(device, num_ops, magnitude, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_trivialaugmentwide(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -598,7 +603,7 @@ def test_trivialaugmentwide(device, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_augmix(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -635,13 +640,13 @@ def shear(pil_img, level, mode, resample): matrix = (1, level, 0, 0, 1, 0) elif mode == "Y": matrix = (1, 0, 0, level, 1, 0) - return pil_img.transform((image_size, image_size), _pil_constants.AFFINE, matrix, resample=resample) + return pil_img.transform((image_size, image_size), PIL.Image.AFFINE, matrix, resample=resample) t_img, pil_img = _create_data(image_size, image_size) resample_pil = { - F.InterpolationMode.NEAREST: _pil_constants.NEAREST, - F.InterpolationMode.BILINEAR: _pil_constants.BILINEAR, + F.InterpolationMode.NEAREST: PIL.Image.NEAREST, + F.InterpolationMode.BILINEAR: PIL.Image.BILINEAR, }[interpolation] level = 0.3 @@ -664,10 +669,20 @@ def shear(pil_img, level, mode, resample): _assert_approx_equal_tensor_to_pil(out, expected_out) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "config", - [{"value": 0.2}, {"value": "random"}, {"value": (0.2, 0.2, 0.2)}, {"value": "random", "ratio": (0.1, 0.2)}], + [ + {}, + {"value": 1}, + {"value": 0.2}, + {"value": "random"}, + {"value": (1, 1, 1)}, + {"value": (0.2, 0.2, 0.2)}, + {"value": [1, 1, 1]}, + {"value": [0.2, 0.2, 0.2]}, + {"value": "random", "ratio": (0.1, 0.2)}, + ], ) def test_random_erasing(device, config): tensor, _ = _create_data(24, 32, channels=3, device=device) @@ -692,7 +707,7 @@ def test_random_erasing_with_invalid_data(): random_erasing(img) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_normalize(device, tmpdir): fn = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) tensor, _ = _create_data(26, 34, device=device) @@ -711,7 +726,7 @@ def test_normalize(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_linear_transformation(device, tmpdir): c, h, w = 3, 24, 32 @@ -737,7 +752,7 @@ def test_linear_transformation(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_compose(device): tensor, _ = _create_data(26, 34, device=device) tensor = 
tensor.to(dtype=torch.float32) / 255.0 @@ -765,7 +780,7 @@ def test_compose(device): torch.jit.script(t) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_random_apply(device): tensor, _ = _create_data(26, 34, device=device) tensor = tensor.to(dtype=torch.float32) / 255.0 @@ -807,7 +822,7 @@ def test_random_apply(device): torch.jit.script(transforms) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "meth_kwargs", [ @@ -843,3 +858,35 @@ def test_gaussian_blur(device, channels, meth_kwargs): agg_method="max", tol=tol, ) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "fill", + [ + 1, + 1.0, + [1], + [1.0], + (1,), + (1.0,), + [1, 2, 3], + [1.0, 2.0, 3.0], + (1, 2, 3), + (1.0, 2.0, 3.0), + ], +) +@pytest.mark.parametrize("channels", [1, 3]) +def test_elastic_transform(device, channels, fill): + if isinstance(fill, (list, tuple)) and len(fill) > 1 and channels == 1: + # For this the test would correctly fail, since the number of channels in the image does not match `fill`. + # Thus, this is not an issue in the transform, but rather a problem of parametrization that just gives the + # product of `fill` and `channels`. + return + + _test_class_op( + T.ElasticTransform, + meth_kwargs=dict(fill=fill), + channels=channels, + device=device, + ) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py new file mode 100644 index 00000000000..9eb209f43fc --- /dev/null +++ b/test/test_transforms_v2.py @@ -0,0 +1,7501 @@ +import contextlib +import decimal +import functools +import inspect +import itertools +import math +import pickle +import random +import re +import sys +from copy import deepcopy +from pathlib import Path +from unittest import mock + +import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.ops +import torchvision.transforms.v2 as transforms + +from common_utils import ( + assert_equal, + cache, + cpu_and_cuda, + freeze_rng_state, + ignore_jit_no_profile_information_warning, + make_bounding_boxes, + make_detection_masks, + make_image, + make_image_pil, + make_image_tensor, + make_keypoints, + make_segmentation_mask, + make_video, + make_video_tensor, + needs_cuda, + set_rng_seed, +) + +from torch import nn +from torch.testing import assert_close +from torch.utils._pytree import tree_flatten, tree_map +from torch.utils.data import DataLoader, default_collate +from torchvision import tv_tensors +from torchvision.ops.boxes import box_iou + +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.functional import pil_modes_mapping, to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import check_type, is_pure_tensor +from torchvision.transforms.v2.functional._geometry import _get_perspective_coeffs, _parallelogram_to_bounding_boxes +from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal + + +# turns all warnings into errors for this module +pytestmark = [pytest.mark.filterwarnings("error")] + +if sys.version_info[:2] >= (3, 12): + # torchscript relies on some AST stuff that got deprecated in 3.12, + # so we have to explicitly ignore those otherwise we'd error on warnings due to the pytestmark filter above. 
+ pytestmark.append(pytest.mark.filterwarnings("ignore::DeprecationWarning")) + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +def _to_tolerances(maybe_tolerance_dict): + if not isinstance(maybe_tolerance_dict, dict): + return dict(rtol=None, atol=None) + + tolerances = dict(rtol=0, atol=0) + tolerances.update(maybe_tolerance_dict) + return tolerances + + +def _check_kernel_cuda_vs_cpu(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces closes results for inputs on GPU and CPU.""" + if input.device.type != "cuda": + return + + input_cuda = input.as_subclass(torch.Tensor) + input_cpu = input_cuda.to("cpu") + + with freeze_rng_state(): + actual = kernel(input_cuda, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input_cpu, *args, **kwargs) + + assert_close(actual, expected, check_device=False, rtol=rtol, atol=atol) + + +@cache +def _script(obj): + try: + return torch.jit.script(obj) + except Exception as error: + name = getattr(obj, "__name__", obj.__class__.__name__) + raise AssertionError(f"Trying to `torch.jit.script` `{name}` raised the error above.") from error + + +def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel is scriptable and if the scripted output is close to the eager one.""" + if input.device.type != "cpu": + return + + kernel_scripted = _script(kernel) + + input = input.as_subclass(torch.Tensor) + with ignore_jit_no_profile_information_warning(): + with freeze_rng_state(): + actual = kernel_scripted(input, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input, *args, **kwargs) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + +def _check_kernel_batched_vs_unbatched(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces close results for batched and unbatched inputs.""" + unbatched_input = input.as_subclass(torch.Tensor) + + for batch_dims in [(2,), (2, 1)]: + repeats = [*batch_dims, *[1] * input.ndim] + + actual = kernel(unbatched_input.repeat(repeats), *args, **kwargs) + + expected = kernel(unbatched_input, *args, **kwargs) + # We can't directly call `.repeat()` on the output, since some kernel also return some additional metadata + if isinstance(expected, torch.Tensor): + expected = expected.repeat(repeats) + else: + tensor, *metadata = expected + expected = (tensor.repeat(repeats), *metadata) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + for degenerate_batch_dims in [(0,), (5, 0), (0, 5)]: + degenerate_batched_input = torch.empty( + degenerate_batch_dims + input.shape, dtype=input.dtype, device=input.device + ) + + output = kernel(degenerate_batched_input, *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.shape[: -input.ndim] == degenerate_batch_dims + + +def check_kernel( + kernel, + input, + *args, + check_cuda_vs_cpu=True, + check_scripted_vs_eager=True, + check_batched_vs_unbatched=True, + **kwargs, +): + initial_input_version = input._version + + output = kernel(input.as_subclass(torch.Tensor), *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + # check that no inplace operation happened + assert input._version == initial_input_version + + if kernel not in {F.to_dtype_image, F.to_dtype_video}: + assert output.dtype 
== input.dtype + assert output.device == input.device + + if check_cuda_vs_cpu: + _check_kernel_cuda_vs_cpu(kernel, input, *args, **kwargs, **_to_tolerances(check_cuda_vs_cpu)) + + if check_scripted_vs_eager: + _check_kernel_scripted_vs_eager(kernel, input, *args, **kwargs, **_to_tolerances(check_scripted_vs_eager)) + + if check_batched_vs_unbatched: + _check_kernel_batched_vs_unbatched(kernel, input, *args, **kwargs, **_to_tolerances(check_batched_vs_unbatched)) + + +def _check_functional_scripted_smoke(functional, input, *args, **kwargs): + """Checks if the functional can be scripted and the scripted version can be called without error.""" + if not isinstance(input, tv_tensors.Image): + return + + functional_scripted = _script(functional) + with ignore_jit_no_profile_information_warning(): + functional_scripted(input.as_subclass(torch.Tensor), *args, **kwargs) + + +def check_functional(functional, input, *args, check_scripted_smoke=True, **kwargs): + unknown_input = object() + with pytest.raises(TypeError, match=re.escape(str(type(unknown_input)))): + functional(unknown_input, *args, **kwargs) + + with mock.patch("torch._C._log_api_usage_once", wraps=torch._C._log_api_usage_once) as spy: + output = functional(input, *args, **kwargs) + + spy.assert_any_call(f"{functional.__module__}.{functional.__name__}") + + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes) and functional is not F.convert_bounding_box_format: + assert output.format == input.format + + if check_scripted_smoke: + _check_functional_scripted_smoke(functional, input, *args, **kwargs) + + +def check_functional_kernel_signature_match(functional, *, kernel, input_type): + """Checks if the signature of the functional matches the kernel signature.""" + functional_params = list(inspect.signature(functional).parameters.values())[1:] + kernel_params = list(inspect.signature(kernel).parameters.values())[1:] + + if issubclass(input_type, tv_tensors.TVTensor): + # We filter out metadata that is implicitly passed to the functional through the input tv_tensor, but has to be + # explicitly passed to the kernel. + explicit_metadata = {tv_tensors.BoundingBoxes: {"format", "canvas_size"}, tv_tensors.KeyPoints: {"canvas_size"}} + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + functional_params = iter(functional_params) + for functional_param, kernel_param in zip(functional_params, kernel_params): + try: + # In general, the functional parameters are a superset of the kernel parameters. Thus, we filter out + # functional parameters that have no kernel equivalent while keeping the order intact. + while functional_param.name != kernel_param.name: + functional_param = next(functional_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel.__name__}` " + f"has no corresponding parameter on the functional `{functional.__name__}`." + ) from None + + if issubclass(input_type, PIL.Image.Image): + # PIL kernels often have more correct annotations, since they are not limited by JIT. Thus, we don't check + # them in the first place. 
+ functional_param._annotation = kernel_param._annotation = inspect.Parameter.empty + + assert functional_param == kernel_param + + +def _check_transform_v1_compatibility(transform, input, *, rtol, atol): + """If the transform defines the ``_v1_transform_cls`` attribute, checks if the transform has a public, static + ``get_params`` method that is the v1 equivalent, the output is close to v1, is scriptable, and the scripted version + can be called without error.""" + if not (type(input) is torch.Tensor or isinstance(input, PIL.Image.Image)): + return + + v1_transform_cls = transform._v1_transform_cls + if v1_transform_cls is None: + return + + if hasattr(v1_transform_cls, "get_params"): + assert type(transform).get_params is v1_transform_cls.get_params + + v1_transform = v1_transform_cls(**transform._extract_params_for_v1_transform()) + + with freeze_rng_state(): + output_v2 = transform(input) + + with freeze_rng_state(): + output_v1 = v1_transform(input) + + assert_close(F.to_image(output_v2), F.to_image(output_v1), rtol=rtol, atol=atol) + + if isinstance(input, PIL.Image.Image): + return + + _script(v1_transform)(input) + + +def _make_transform_sample(transform, *, image_or_video, adapter): + device = image_or_video.device if isinstance(image_or_video, torch.Tensor) else "cpu" + size = F.get_size(image_or_video) + input = dict( + image_or_video=image_or_video, + image_tv_tensor=make_image(size, device=device), + video_tv_tensor=make_video(size, device=device), + image_pil=make_image_pil(size), + bounding_boxes_xyxy=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.XYXY, device=device), + bounding_boxes_xywh=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.XYWH, device=device), + bounding_boxes_cxcywh=make_bounding_boxes(size, format=tv_tensors.BoundingBoxFormat.CXCYWH, device=device), + bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [2, 0, 1, 1], # x1 > x2, y1 < y2 + [0, 2, 1, 1], # x1 < x2, y1 > y2 + [2, 2, 1, 1], # x1 > x2, y1 > y2 + ], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=size, + device=device, + ), + bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.XYWH, + canvas_size=size, + device=device, + ), + bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.CXCYWH, + canvas_size=size, + device=device, + ), + keypoints=make_keypoints(canvas_size=size), + detection_mask=make_detection_masks(size, device=device), + segmentation_mask=make_segmentation_mask(size, device=device), + int=0, + float=0.0, + bool=True, + none=None, + str="str", + path=Path.cwd(), + object=object(), + tensor=torch.empty(5), + array=np.empty(5), + ) + if adapter is not None: + input = adapter(transform, input, device) + return input + + +def _check_transform_sample_input_smoke(transform, input, *, adapter): + # This is a bunch of input / output convention checks, using a big sample with different parts as input. 
+ + if not check_type(input, (is_pure_tensor, PIL.Image.Image, tv_tensors.Image, tv_tensors.Video)): + return + + sample = _make_transform_sample( + # adapter might change transform inplace + transform=transform if adapter is None else deepcopy(transform), + image_or_video=input, + adapter=adapter, + ) + for container_type in [dict, list, tuple]: + if container_type is dict: + input = sample + else: + input = container_type(sample.values()) + + input_flat, input_spec = tree_flatten(input) + + with freeze_rng_state(): + torch.manual_seed(0) + output = transform(input) + output_flat, output_spec = tree_flatten(output) + + assert output_spec == input_spec + + for output_item, input_item, should_be_transformed in zip( + output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat) + ): + if should_be_transformed: + assert type(output_item) is type(input_item) + else: + assert output_item is input_item + + # Enforce that the transform does not turn a degenerate bounding box, e.g. marked by RandomIoUCrop (or any other + # future transform that does this), back into a valid one. + # TODO: We may want to do that for KeyPoints too + for degenerate_bounding_boxes in ( + bounding_box + for name, bounding_box in sample.items() + if "degenerate" in name and isinstance(bounding_box, tv_tensors.BoundingBoxes) + ): + sample = dict( + boxes=degenerate_bounding_boxes, + labels=torch.randint(10, (degenerate_bounding_boxes.shape[0],), device=degenerate_bounding_boxes.device), + ) + assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + + +def check_transform(transform, input, check_v1_compatibility=True, check_sample_input=True): + pickle.loads(pickle.dumps(transform)) + + output = transform(input) + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes) and not isinstance(transform, transforms.ConvertBoundingBoxFormat): + assert output.format == input.format + + if check_sample_input: + _check_transform_sample_input_smoke( + transform, input, adapter=check_sample_input if callable(check_sample_input) else None + ) + + if check_v1_compatibility: + _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility)) + + return output + + +def transform_cls_to_functional(transform_cls, **transform_specific_kwargs): + def wrapper(input, *args, **kwargs): + transform = transform_cls(*args, **transform_specific_kwargs, **kwargs) + return transform(input) + + wrapper.__name__ = transform_cls.__name__ + + return wrapper + + +def param_value_parametrization(**kwargs): + """Helper function to turn + + @pytest.mark.parametrize( + ("param", "value"), + ("a", 1), + ("a", 2), + ("a", 3), + ("b", -1.0) + ("b", 1.0) + ) + + into + + @param_value_parametrization(a=[1, 2, 3], b=[-1.0, 1.0]) + """ + return pytest.mark.parametrize( + ("param", "value"), + [(param, value) for param, values in kwargs.items() for value in values], + ) + + +def adapt_fill(value, *, dtype): + """Adapt fill values in the range [0.0, 1.0] to the value range of the dtype""" + if value is None: + return value + + max_value = get_max_value(dtype) + value_type = float if dtype.is_floating_point else int + + if isinstance(value, (int, float)): + return value_type(value * max_value) + elif isinstance(value, (list, tuple)): + return type(value)(value_type(v * max_value) for v in value) + else: + raise ValueError(f"fill should be an int or float, or a list or tuple of the former, but got '{value}'.") + + +EXHAUSTIVE_TYPE_FILLS = [ + None, + 1, + 0.5, + [1], + 
[0.2], + (0,), + (0.7,), + [1, 0, 1], + [0.1, 0.2, 0.3], + (0, 1, 0), + (0.9, 0.234, 0.314), +] +CORRECTNESS_FILLS = [ + v for v in EXHAUSTIVE_TYPE_FILLS if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1) +] + + +# We cannot use `list(transforms.InterpolationMode)` here, since it includes some PIL-only ones as well +INTERPOLATION_MODES = [ + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.NEAREST_EXACT, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, +] + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + clamping_mode = bounding_boxes.clamping_mode + + def affine_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output_xyxy = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format + ) + + if clamp: + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + output = F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + else: + # We leave the bounding box as float64 so the caller gets the full precision to perform any additional + # operation + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( + bounding_boxes.shape + ), + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + + +def reference_affine_rotated_bounding_boxes_helper( + bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True, flip=False +): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + clamping_mode = bounding_boxes.clamping_mode + + def affine_rotated_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWHR -> XYXYXYXY and W or H is 1 + input_xyxyxyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, + inplace=True, + ) + x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y2, 1.0], + [x3, y3, 1.0], + [x4, y4, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + output = torch.tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + 
float(transformed_points[1, 0]), + float(transformed_points[1, 1]), + float(transformed_points[2, 0]), + float(transformed_points[2, 1]), + float(transformed_points[3, 0]), + float(transformed_points[3, 1]), + ] + ) + + output = output[[2, 3, 0, 1, 6, 7, 4, 5]] if flip else output + output = _parallelogram_to_bounding_boxes(output) + + output = F.convert_bounding_box_format( + output, old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, new_format=format + ) + + return ( + F.clamp_bounding_boxes( + output.to(dtype=dtype, device=device), + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + if clamp + else output.to(dtype=output.dtype, device=device) + ) + + return tv_tensors.BoundingBoxes( + torch.cat( + [ + affine_rotated_bounding_boxes(b) + for b in bounding_boxes.reshape( + -1, 5 if format != tv_tensors.BoundingBoxFormat.XYXYXYXY else 8 + ).unbind() + ], + dim=0, + ).reshape(bounding_boxes.shape), + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ) + + +def reference_affine_keypoints_helper(keypoints, *, affine_matrix, new_canvas_size=None, clamp=True): + canvas_size = new_canvas_size or keypoints.canvas_size + + def affine_keypoints(keypoints): + dtype = keypoints.dtype + device = keypoints.device + + # Go to float before converting to prevent precision loss + x, y = keypoints.to(dtype=torch.float64, device="cpu", copy=True).squeeze(0).tolist() + + points = np.array([[x, y, 1.0]]) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output = torch.Tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + ] + ) + + if clamp: + output = F.clamp_keypoints(output, canvas_size=canvas_size) + else: + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.KeyPoints( + torch.cat([affine_keypoints(k) for k in keypoints.reshape(-1, 2).unbind()], dim=0).reshape(keypoints.shape), + canvas_size=canvas_size, + ) + + +class TestResize: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = [17, [17], (17,), None, [12, 13], (12, 13)] + + def _make_max_size_kwarg(self, *, use_max_size, size): + if size is None: + max_size = min(list(self.INPUT_SIZE)) + elif use_max_size: + if not (isinstance(size, int) or len(size) == 1): + # This would result in an `ValueError` + return None + + max_size = (size if isinstance(size, int) else size[0]) + 1 + else: + max_size = None + + return dict(max_size=max_size) + + def _compute_output_size(self, *, input_size, size, max_size): + if size is None: + size = max_size + + elif not (isinstance(size, int) or len(size) == 1): + return tuple(size) + + elif not isinstance(size, int): + size = size[0] + + old_height, old_width = input_size + ratio = old_width / old_height + if ratio > 1: + new_height = size + new_width = int(ratio * new_height) + else: + new_width = size + new_height = int(new_width / ratio) + + if max_size is not None and max(new_height, new_width) > max_size: + # Need to recompute the aspect ratio, since it might have changed due to rounding + ratio = new_width / new_height + if ratio > 1: + new_width = max_size + new_height = int(new_width / ratio) + else: + new_height = max_size + new_width = int(new_height * ratio) + + return new_height, new_width + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, 
torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + # In contrast to CPU, there is no native `InterpolationMode.BICUBIC` implementation for uint8 images on CUDA. + # Internally, it uses the float path. Thus, we need to test with an enormous tolerance here to account for that. + atol = 30 if (interpolation is transforms.InterpolationMode.BICUBIC and dtype is torch.uint8) else 1 + check_cuda_vs_cpu_tolerances = dict(rtol=0, atol=atol / 255 if dtype.is_floating_point else atol) + + check_kernel( + F.resize_image, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + size=size, + interpolation=interpolation, + **max_size_kwarg, + antialias=antialias, + check_cuda_vs_cpu=check_cuda_vs_cpu_tolerances, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + + bounding_boxes = make_bounding_boxes( + format=format, + canvas_size=self.INPUT_SIZE, + dtype=dtype, + device=device, + ) + check_kernel( + F.resize_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + size=size, + **max_size_kwarg, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, size, use_max_size, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + keypoints = make_keypoints( + canvas_size=self.INPUT_SIZE, + dtype=dtype, + device=device, + ) + check_kernel( + F.resize_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + size=size, + **max_size_kwarg, + check_scripted_vs_eager=not isinstance(size, int), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1]) + + def test_kernel_video(self): + check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, size, make_input): + max_size_kwarg = self._make_max_size_kwarg(use_max_size=size is None, size=size) + + check_functional( + F.resize, + make_input(self.INPUT_SIZE), + size=size, + **max_size_kwarg, + antialias=True, + check_scripted_smoke=not isinstance(size, int), + ) + + 
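Note: for readers following `_make_max_size_kwarg` and `_compute_output_size` above, a worked numeric example of the expected-size arithmetic may help. `expected_resize_size` below is a hypothetical standalone restatement of the test helper's int-`size` path (it is not part of the suite), using the suite's `INPUT_SIZE = (17, 11)` and the `max_size = size + 1` convention:

def expected_resize_size(input_size, size, max_size=None):
    # Mirrors TestResize._compute_output_size for an int `size`.
    old_h, old_w = input_size
    ratio = old_w / old_h
    if ratio > 1:                      # wide image: height is the smaller edge
        new_h, new_w = size, int(ratio * size)
    else:                              # tall/square image: width is the smaller edge
        new_w = size
        new_h = int(new_w / ratio)
    if max_size is not None and max(new_h, new_w) > max_size:
        # Recompute the ratio from the already-rounded dims, as the helper does.
        ratio = new_w / new_h
        if ratio > 1:
            new_w = max_size
            new_h = int(new_w / ratio)
        else:
            new_h = max_size
            new_w = int(new_h * ratio)
    return new_h, new_w

# With INPUT_SIZE = (17, 11), the width (11) is the smaller edge: size=12 gives
# width 12 and height int(12 * 17 / 11) == 18. max_size=13 then caps the larger
# edge at 13 and rescales the other one from the rounded dims.
assert expected_resize_size((17, 11), size=12) == (18, 12)
assert expected_resize_size((17, 11), size=12, max_size=13) == (13, 8)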
@pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.resize_image, torch.Tensor), + (F._geometry._resize_image_pil, PIL.Image.Image), + (F.resize_image, tv_tensors.Image), + (F.resize_mask, tv_tensors.Mask), + (F.resize_video, tv_tensors.Video), + (F.resize_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.resize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_transform(self, size, device, make_input): + max_size_kwarg = self._make_max_size_kwarg(use_max_size=size is None, size=size) + + check_transform( + transforms.Resize(size=size, **max_size_kwarg, antialias=True), + make_input(self.INPUT_SIZE, device=device), + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_v1_compatibility=dict(rtol=0, atol=1) if size is not None else False, + ) + + def _check_output_size(self, input, output, *, size, max_size): + assert tuple(F.get_size(output)) == self._compute_output_size( + input_size=F.get_size(input), size=size, max_size=max_size + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. + # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_image_correctness(self, size, interpolation, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) + expected = F.to_image(F.resize(F.to_pil_image(image), size=size, interpolation=interpolation, **max_size_kwarg)) + + self._check_output_size(image, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resize_bounding_boxes(self, bounding_boxes, format, *, size, max_size=None): + old_height, old_width = bounding_boxes.canvas_size + new_height, new_width = self._compute_output_size( + input_size=bounding_boxes.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return bounding_boxes + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + + return helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + 
def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) + + actual = fn(bounding_boxes, size=size, **max_size_kwarg) + expected = self._reference_resize_bounding_boxes(bounding_boxes, format=format, size=size, **max_size_kwarg) + + self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + def _reference_resize_keypoints(self, keypoints, *, size, max_size=None): + old_height, old_width = keypoints.canvas_size + new_height, new_width = self._compute_output_size( + input_size=keypoints.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return keypoints + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + return reference_affine_keypoints_helper( + keypoints, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_keypoints_correctness(self, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + keypoints = make_keypoints(canvas_size=self.INPUT_SIZE) + + actual = fn(keypoints, size=size, **max_size_kwarg) + expected = self._reference_resize_keypoints(keypoints, size=size, **max_size_kwarg) + + self._check_output_size(keypoints, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_pil_interpolation_compat_smoke(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + with ( + contextlib.nullcontext() + if isinstance(input, PIL.Image.Image) + # This error is triggered in PyTorch core + else pytest.raises(NotImplementedError, match=f"got {interpolation.value.lower()}") + ): + F.resize( + input, + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + def test_functional_pil_antialias_warning(self): + with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): + F.resize(make_image_pil(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_max_size_error(self, size, make_input): + if size is None: + # value can be anything other than an integer + max_size = None + match = "max_size must be an integer when size is None" + elif isinstance(size, int) or len(size) == 1: + max_size = (size if isinstance(size, int) else size[0]) - 1 + match = "must be strictly greater than the requested size" + else: + # value can be anything other than None + max_size = -1 + match = "size should be an int or a sequence of length 1" + + with pytest.raises(ValueError, match=match): + F.resize(make_input(self.INPUT_SIZE), 
size=size, max_size=max_size, antialias=True) + + if isinstance(size, list) and len(size) != 1: + with pytest.raises(ValueError, match="max_size should only be passed if size is None or specifies"): + F.resize(make_input(self.INPUT_SIZE), size=size, max_size=500) + + @pytest.mark.parametrize( + "input_size, max_size, expected_size", + [ + ((10, 10), 10, (10, 10)), + ((10, 20), 40, (20, 40)), + ((20, 10), 40, (40, 20)), + ((10, 20), 10, (5, 10)), + ((20, 10), 10, (10, 5)), + ], + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_resize_size_none(self, input_size, max_size, expected_size, make_input): + img = make_input(input_size) + out = F.resize(img, size=None, max_size=max_size) + assert F.get_size(out)[-2:] == list(expected_size) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_interpolation_int(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to + # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a + # difference and thus we don't test it here. + if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: + return + + expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) + actual = F.resize( + input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True + ) + + assert_equal(actual, expected) + + def test_transform_unknown_size_error(self): + with pytest.raises(ValueError, match="size can be an integer, a sequence of one or two integers, or None"): + transforms.Resize(size=object()) + + @pytest.mark.parametrize( + "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_noop(self, size, make_input): + input = make_input(self.INPUT_SIZE) + + output = F.resize(input, size=F.get_size(input), antialias=True) + + # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there + # is a good reason to break this, feel free to downgrade to an equality check. + if isinstance(input, tv_tensors.TVTensor): + # We can't test identity directly, since that checks for the identity of the Python object. Since all + # tv_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. 
Thus, we check + # that the underlying storage is the same + assert output.data_ptr() == input.data_ptr() + else: + assert output is input + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + make_keypoints, + ], + ) + def test_no_regression_5405(self, make_input): + # Checks that `max_size` is not ignored if `size == small_edge_size` + # See https://github.com/pytorch/vision/issues/5405 + + input = make_input(self.INPUT_SIZE) + + size = min(F.get_size(input)) + max_size = size + 1 + output = F.resize(input, size=size, max_size=max_size, antialias=True) + + assert max(F.get_size(output)) == max_size + + def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_format, **kwargs): + # torch.channels_last memory_format is only available for 4D tensors, i.e. (B, C, H, W). However, images coming + # from PIL or our own I/O functions do not have a batch dimensions and are thus 3D, i.e. (C, H, W). Still, the + # layout of the data in memory is channels last. To emulate this when a 3D input is requested here, we create + # the image as 4D and create a view with the right shape afterwards. With this the layout in memory is channels + # last although PyTorch doesn't recognizes it as such. + emulate_channels_last = memory_format is torch.channels_last and len(batch_dims) != 1 + + image = make_image( + *args, + batch_dims=(math.prod(batch_dims),) if emulate_channels_last else batch_dims, + memory_format=memory_format, + **kwargs, + ) + + if emulate_channels_last: + image = tv_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image) + + return image + + def _check_stride(self, image, *, memory_format): + C, H, W = F.get_dimensions(image) + if memory_format is torch.contiguous_format: + expected_stride = (H * W, W, 1) + elif memory_format is torch.channels_last: + expected_stride = (1, W * C, C) + else: + raise ValueError(f"Unknown memory_format: {memory_format}") + + assert image.stride() == expected_stride + + # TODO: We can remove this test and related torchvision workaround + # once we fixed related pytorch issue: https://github.com/pytorch/pytorch/issues/68430 + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("memory_format", [torch.contiguous_format, torch.channels_last]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_memory_format_consistency(self, interpolation, antialias, memory_format, dtype, device): + size = self.OUTPUT_SIZES[0] + + input = self._make_image(self.INPUT_SIZE, dtype=dtype, device=device, memory_format=memory_format) + + # Smoke test to make sure we aren't starting with wrong assumptions + self._check_stride(input, memory_format=memory_format) + + output = F.resize_image(input, size=size, interpolation=interpolation, antialias=antialias) + + self._check_stride(output, memory_format=memory_format) + + def test_float16_no_rounding(self): + # Make sure Resize() doesn't round float16 images + # Non-regression test for https://github.com/pytorch/vision/issues/7667 + + input = make_image_tensor(self.INPUT_SIZE, dtype=torch.float16) + output = F.resize_image(input, size=self.OUTPUT_SIZES[0], antialias=True) + + assert output.dtype is torch.float16 + assert (output.round() - output).abs().sum() > 0 + + +class TestHorizontalFlip: + 
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.horizontal_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.horizontal_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.horizontal_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.horizontal_flip_image, torch.Tensor), + (F._geometry._horizontal_flip_image_pil, PIL.Image.Image), + (F.horizontal_flip_image, tv_tensors.Image), + (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.horizontal_flip_mask, tv_tensors.Mask), + (F.horizontal_flip_video, tv_tensors.Video), + (F.horizontal_flip_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomHorizontalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes): + affine_matrix = np.array( + [ + [-1, 0, bounding_boxes.canvas_size[1]], + [0, 1, 0], + ], + ) + + helper = ( + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, 
clamp=False) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_keypoints(self, keypoints): + affine_matrix = np.array( + [ + [-1, 0, keypoints.canvas_size[1] - 1], + [0, 1, 0], + ], + ) + + return reference_affine_keypoints_helper(keypoints, affine_matrix=affine_matrix) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_keypoints_correctness(self, fn): + keypoints = make_keypoints() + + actual = fn(keypoints) + expected = self._reference_horizontal_flip_keypoints(keypoints) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomHorizontalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestAffine: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + translate=[[6.3, -0.6], [1, -3], (16.6, -6.6), (-2, 4)], + # float + scale=[0.5], + # float, int, + # one-list of float, one-list of int, one-tuple of float, one-tuple of int + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + shear=[35.6, 38, [-37.7], [-23], (5.3,), (-52,), [5.4, 21.8], [-47, 51], (-11.2, 36.7), (8, -53)], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + # The special case for shear makes sure we pick a value that is supported while JIT scripting + _MINIMAL_AFFINE_KWARGS = { + k: vs[0] if k != "shear" else next(v for v in vs if isinstance(v, list)) + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + translate=[None, (0.5, 0.5)], + scale=[None, (0.75, 1.25)], + shear=[None, (12, 30, -17, 5), 10, (-5, 12)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = { + k: next(v for v in vs if v is not None) for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items() + } + + def _check_kernel(self, kernel, input, *args, **kwargs): + kwargs_ = self._MINIMAL_AFFINE_KWARGS.copy() + kwargs_.update(kwargs) + check_kernel(kernel, input, *args, **kwargs_) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + 
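# --- Editor's sketch (NumPy/Python only, not part of the tests): the reference affine
# matrix built by _compute_affine_matrix further down in this class is the composition
# T @ C @ (R*S) @ Sh_y @ Sh_x @ C^-1, i.e. shear first, then rotate-and-scale about
# `center`, then translate. With all parameters at their neutral values it collapses
# to the 2x3 identity, which is a quick sanity check of the composition order.
import math

import numpy as np

def compose_affine(angle, translate, scale, shear, center):
    rot = math.radians(angle)
    sx, sy = (math.radians(s) for s in shear)
    cx, cy = center
    tx, ty = translate
    c = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]])
    t = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
    rs = np.array(
        [
            [scale * math.cos(rot), -scale * math.sin(rot), 0],
            [scale * math.sin(rot), scale * math.cos(rot), 0],
            [0, 0, 1],
        ]
    )
    shear_x = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]])
    shear_y = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]])
    return (t @ c @ rs @ shear_y @ shear_x @ np.linalg.inv(c))[:2, :]

assert np.allclose(compose_affine(0, (0, 0), 1.0, (0, 0), (5, 5)), np.eye(3)[:2, :])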
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + self._check_kernel( + F.affine_image, + make_image(dtype=dtype, device=device), + **{param: value}, + check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), + check_cuda_vs_cpu=( + dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True + ), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + self._check_kernel( + F.affine_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, param, value, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + self._check_kernel( + F.affine_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + self._check_kernel(F.affine_mask, make_mask()) + + def test_kernel_video(self): + self._check_kernel(F.affine_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.affine_image, torch.Tensor), + (F._geometry._affine_image_pil, PIL.Image.Image), + (F.affine_image, tv_tensors.Image), + (F.affine_bounding_boxes, tv_tensors.BoundingBoxes), + (F.affine_mask, tv_tensors.Mask), + (F.affine_video, tv_tensors.Video), + (F.affine_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.affine, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + 
make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + input = make_input(device=device) + + check_transform(transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), input) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.affine( + image, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + interpolation=interpolation, + fill=fill, + ) + expected = F.to_image( + F.affine( + F.to_pil_image(image), + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + interpolation=interpolation, + fill=fill, + ) + ) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 if interpolation is transforms.InterpolationMode.NEAREST else 8 + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, center, interpolation, fill, seed): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + transform = transforms.RandomAffine( + **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center, interpolation=interpolation, fill=fill + ) + + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 if interpolation is transforms.InterpolationMode.NEAREST else 8 + + def _compute_affine_matrix(self, *, angle, translate, scale, shear, center): + rot = math.radians(angle) + cx, cy = center + tx, ty = translate + sx, sy = (math.radians(s) for s in ([shear, 0.0] if isinstance(shear, (int, float)) else shear)) + + c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) + t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) + c_matrix_inv = np.linalg.inv(c_matrix) + rs_matrix = np.array( + [ + [scale * math.cos(rot), -scale * math.sin(rot), 0], + [scale * math.sin(rot), scale * math.cos(rot), 0], + [0, 0, 1], + ] + ) + shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) + shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) + rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) + true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) + return true_matrix[:2, :] + + def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center): + if center is None: + center = [s * 0.5 for s in 
bounding_boxes.canvas_size[::-1]] + + affine_matrix = self._compute_affine_matrix( + angle=angle, translate=translate, scale=scale, shear=shear, center=center + ) + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + + return helper( + bounding_boxes, + affine_matrix=affine_matrix, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center): + bounding_boxes = make_bounding_boxes(format=format) + + actual = F.affine( + bounding_boxes, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + expected = self._reference_affine_bounding_boxes( + bounding_boxes, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + + torch.testing.assert_close(actual, expected, atol=1e-4, rtol=1e-4) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, center, seed): + bounding_boxes = make_bounding_boxes(format=format) + + transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) + + torch.manual_seed(seed) + params = transform.make_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center) + + torch.testing.assert_close(actual, expected, atol=1e-5, rtol=2e-5) + + def _reference_affine_keypoints(self, keypoints, *, angle, translate, scale, shear, center): + if center is None: + center = [s * 0.5 for s in keypoints.canvas_size[::-1]] + + return reference_affine_keypoints_helper( + keypoints, + affine_matrix=self._compute_affine_matrix( + angle=angle, translate=translate, scale=scale, shear=shear, center=center + ), + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"]) + @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"]) + @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_keypoints_correctness(self, angle, translate, scale, shear, center): + keypoints = make_keypoints() + + actual = F.affine( + keypoints, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + expected = self._reference_affine_keypoints( + keypoints, + angle=angle, + translate=translate, + scale=scale, + shear=shear, + center=center, + ) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_keypoints_correctness(self, center, seed): + keypoints = make_keypoints() + + transform = 
transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center) + + torch.manual_seed(seed) + params = transform.make_params([keypoints]) + + torch.manual_seed(seed) + actual = transform(keypoints) + + expected = self._reference_affine_keypoints(keypoints, **params, center=center) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("translate", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["translate"]) + @pytest.mark.parametrize("scale", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["scale"]) + @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transformmake_params_bounds(self, degrees, translate, scale, shear, seed): + image = make_image() + height, width = F.get_size(image) + + transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) + + torch.manual_seed(seed) + params = transform.make_params([image]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + if translate is not None: + width_max = int(round(translate[0] * width)) + height_max = int(round(translate[1] * height)) + assert -width_max <= params["translate"][0] <= width_max + assert -height_max <= params["translate"][1] <= height_max + else: + assert params["translate"] == (0, 0) + + if scale is not None: + assert scale[0] <= params["scale"] <= scale[1] + else: + assert params["scale"] == 1.0 + + if shear is not None: + if isinstance(shear, (int, float)): + assert -shear <= params["shear"][0] <= shear + assert params["shear"][1] == 0.0 + elif len(shear) == 2: + assert shear[0] <= params["shear"][0] <= shear[1] + assert params["shear"][1] == 0.0 + elif len(shear) == 4: + assert shear[0] <= params["shear"][0] <= shear[1] + assert shear[2] <= params["shear"][1] <= shear[3] + else: + assert params["shear"] == (0, 0) + + @pytest.mark.parametrize("param", ["degrees", "translate", "scale", "shear", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param in {"degrees", "shear"} and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomAffine(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + @pytest.mark.parametrize("translate", [[-1, 0], [2, 0], [-1, 2]]) + def test_transform_translate_range_error(self, translate): + with pytest.raises(ValueError, match="translation values should be between 0 and 1"): + transforms.RandomAffine(degrees=0, translate=translate) + + @pytest.mark.parametrize("scale", [[-1, 0], [0, -1], [-1, -1]]) + def test_transform_scale_range_error(self, scale): + with pytest.raises(ValueError, match="scale values should be positive"): + transforms.RandomAffine(degrees=0, scale=scale) + + def test_transform_negative_shear_error(self): + with pytest.raises(ValueError, match="If shear is a single number, it must be positive"): + transforms.RandomAffine(degrees=0, shear=-1) + + def test_transform_unknown_fill_error(self): 
+ with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestVerticalFlip: + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.vertical_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + check_kernel( + F.vertical_flip_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.vertical_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.vertical_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.vertical_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.vertical_flip_image, torch.Tensor), + (F._geometry._vertical_flip_image_pil, PIL.Image.Image), + (F.vertical_flip_image, tv_tensors.Image), + (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.vertical_flip_mask, tv_tensors.Mask), + (F.vertical_flip_video, tv_tensors.Video), + (F.vertical_flip_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_bounding_boxes(self, bounding_boxes: tv_tensors.BoundingBoxes): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, bounding_boxes.canvas_size[0]], + ], + ) + + helper = ( + functools.partial(reference_affine_rotated_bounding_boxes_helper, flip=True) + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else 
reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, clamp=False) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_keypoints(self, keypoints): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, keypoints.canvas_size[0] - 1], + ], + ) + + return reference_affine_keypoints_helper(keypoints, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_keypoints_correctness(self, fn): + keypoints = make_keypoints() + + actual = fn(keypoints) + expected = self._reference_vertical_flip_keypoints(keypoints) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomVerticalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestRotate: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + _MINIMAL_AFFINE_KWARGS = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items()} + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or isinstance(v, list)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items()} + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + check_kernel( + F.rotate_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] 
= self._MINIMAL_AFFINE_KWARGS["angle"] + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + if tv_tensors.is_rotated_bounding_format(format): + # TODO there is a 1e-6 difference between GPU and CPU outputs + # due to clamping. To avoid failing this test, we do clamp before hand. + bounding_boxes = F.clamp_bounding_boxes(bounding_boxes) + + check_kernel( + F.rotate_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + + keypoints = make_keypoints(dtype=dtype, device=device) + + check_kernel( + F.rotate_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + **kwargs, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) + + def test_kernel_video(self): + check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rotate_image, torch.Tensor), + (F._geometry._rotate_image_pil, PIL.Image.Image), + (F.rotate_image, tv_tensors.Image), + (F.rotate_mask, tv_tensors.Mask), + (F.rotate_video, tv_tensors.Video), + (F.rotate_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rotate, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform( + transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), make_input(device=device) + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.rotate(image, angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill) + expected = F.to_image( + F.rotate( + F.to_pil_image(image), angle=angle, 
center=center, interpolation=interpolation, expand=expand, fill=fill + ) + ) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 + + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, center, interpolation, expand, fill, seed): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + transform = transforms.RandomRotation( + **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, + center=center, + interpolation=interpolation, + expand=expand, + fill=fill, + ) + + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 1 if interpolation is transforms.InterpolationMode.NEAREST else 6 + + def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix): + if not expand: + return canvas_size, (0.0, 0.0) + + input_height, input_width = canvas_size + + input_image_frame = np.array( + [ + [0.0, 0.0, 1.0], + [0.0, input_height, 1.0], + [input_width, input_height, 1.0], + [input_width, 0.0, 1.0], + ], + dtype=np.float64, + ) + output_image_frame = np.matmul(input_image_frame, affine_matrix.astype(input_image_frame.dtype).T) + + recenter_x = float(np.min(output_image_frame[:, 0])) + recenter_y = float(np.min(output_image_frame[:, 1])) + + output_width = int(np.max(output_image_frame[:, 0]) - recenter_x) + output_height = int(np.max(output_image_frame[:, 1]) - recenter_y) + + return (output_height, output_width), (recenter_x, recenter_y) + + def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy): + x, y = recenter_xy + if bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXY: + translate = [x, y, x, y] + elif bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXYXYXY: + translate = [x, y, x, y, x, y, x, y] + elif ( + bounding_boxes.format is tv_tensors.BoundingBoxFormat.CXCYWHR + or bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYWHR + ): + translate = [x, y, 0.0, 0.0, 0.0] + else: + translate = [x, y, 0.0, 0.0] + return tv_tensors.wrap( + (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes + ) + + def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center): + if center is None: + center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]] + cx, cy = center + + a = np.cos(angle * np.pi / 180.0) + b = np.sin(angle * np.pi / 180.0) + affine_matrix = np.array( + [ + [a, b, cx - cx * a - b * cy], + [-b, a, cy + cx * b - a * cy], + ], + ) + + new_canvas_size, recenter_xy = self._compute_output_canvas_size( + expand=expand, canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix + ) + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + output = helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=new_canvas_size, + clamp=False, + ) + + return self._recenter_bounding_boxes_after_expand(output, 
recenter_xy=recenter_xy).to(bounding_boxes) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): + bounding_boxes = make_bounding_boxes(format=format, clamping_mode=None) + + actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): + bounding_boxes = make_bounding_boxes(format=format, clamping_mode=None) + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform.make_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + torch.testing.assert_close(actual, expected) + + def _recenter_keypoints_after_expand(self, keypoints, *, recenter_xy): + x, y = recenter_xy + translate = [x, y] + return tv_tensors.wrap( + (keypoints.to(torch.float64) - torch.tensor(translate)).to(keypoints.dtype), like=keypoints + ) + + def _reference_rotate_keypoints(self, keypoints, *, angle, expand, center): + if center is None: + center = [s * 0.5 for s in keypoints.canvas_size[::-1]] + cx, cy = center + + a = np.cos(angle * np.pi / 180.0) + b = np.sin(angle * np.pi / 180.0) + affine_matrix = np.array( + [ + [a, b, cx - cx * a - b * cy], + [-b, a, cy + cx * b - a * cy], + ], + ) + + new_canvas_size, recenter_xy = self._compute_output_canvas_size( + expand=expand, canvas_size=keypoints.canvas_size, affine_matrix=affine_matrix + ) + + output = reference_affine_keypoints_helper( + keypoints, + affine_matrix=affine_matrix, + new_canvas_size=new_canvas_size, + clamp=False, + ) + + return F.clamp_keypoints(self._recenter_keypoints_after_expand(output, recenter_xy=recenter_xy)).to(keypoints) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_keypoints_correctness(self, angle, expand, center): + keypoints = make_keypoints() + + actual = F.rotate(keypoints, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_keypoints(keypoints, angle=angle, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + 
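# --- Editor's sketch (NumPy only, a simplified illustration rather than the exact
# reference implementation): the expanded canvas computed by _compute_output_canvas_size
# above is the axis-aligned extent of the four rotated image corners. Rotating a
# 10 x 20 image by 90 degrees therefore yields a 20 x 10 canvas.
import numpy as np

def expanded_canvas(height, width, angle_deg):
    a, b = np.cos(np.radians(angle_deg)), np.sin(np.radians(angle_deg))
    corners = np.array([[0, 0], [width, 0], [width, height], [0, height]], dtype=np.float64)
    rotated = corners @ np.array([[a, -b], [b, a]]).T  # rotate each corner about the origin
    extent = rotated.max(axis=0) - rotated.min(axis=0)  # (width, height) of the bounding frame
    return int(extent[1]), int(extent[0])  # return as (height, width), like canvas_size

assert expanded_canvas(10, 20, 90) == (20, 10)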
@pytest.mark.parametrize("seed", list(range(5))) + def test_transform_keypoints_correctness(self, expand, center, seed): + keypoints = make_keypoints() + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform.make_params([keypoints]) + + torch.manual_seed(seed) + actual = transform(keypoints) + + expected = self._reference_rotate_keypoints(keypoints, **params, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transformmake_params_bounds(self, degrees, seed): + transform = transforms.RandomRotation(degrees=degrees) + + torch.manual_seed(seed) + params = transform.make_params([]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + @pytest.mark.parametrize("param", ["degrees", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param == "degrees" and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomRotation(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + @pytest.mark.parametrize("size", [(11, 17), (16, 16)]) + @pytest.mark.parametrize("angle", [0, 90, 180, 270]) + @pytest.mark.parametrize("expand", [False, True]) + def test_functional_image_fast_path_correctness(self, size, angle, expand): + image = make_image(size, dtype=torch.uint8, device="cpu") + + actual = F.rotate(image, angle=angle, expand=expand) + expected = F.to_image(F.rotate(F.to_pil_image(image), angle=angle, expand=expand)) + + torch.testing.assert_close(actual, expected) + + +class TestContainerTransforms: + class BuiltinTransform(transforms.Transform): + def transform(self, inpt, params): + return inpt + + class PackedInputTransform(nn.Module): + def forward(self, sample): + assert len(sample) == 2 + return sample + + class UnpackedInputTransform(nn.Module): + def forward(self, image, label): + return image, label + + @pytest.mark.parametrize( + "transform_cls", [transforms.Compose, functools.partial(transforms.RandomApply, p=1), transforms.RandomOrder] + ) + @pytest.mark.parametrize( + "wrapped_transform_clss", + [ + [BuiltinTransform], + [PackedInputTransform], + [UnpackedInputTransform], + [BuiltinTransform, BuiltinTransform], + [PackedInputTransform, PackedInputTransform], + [UnpackedInputTransform, UnpackedInputTransform], + [BuiltinTransform, PackedInputTransform, BuiltinTransform], + [BuiltinTransform, UnpackedInputTransform, BuiltinTransform], + [PackedInputTransform, BuiltinTransform, PackedInputTransform], + [UnpackedInputTransform, BuiltinTransform, UnpackedInputTransform], + ], + ) + 
@pytest.mark.parametrize("unpack", [True, False]) + def test_packed_unpacked(self, transform_cls, wrapped_transform_clss, unpack): + needs_packed_inputs = any(issubclass(cls, self.PackedInputTransform) for cls in wrapped_transform_clss) + needs_unpacked_inputs = any(issubclass(cls, self.UnpackedInputTransform) for cls in wrapped_transform_clss) + assert not (needs_packed_inputs and needs_unpacked_inputs) + + transform = transform_cls([cls() for cls in wrapped_transform_clss]) + + image = make_image() + label = 3 + packed_input = (image, label) + + def call_transform(): + if unpack: + return transform(*packed_input) + else: + return transform(packed_input) + + if needs_unpacked_inputs and not unpack: + with pytest.raises(TypeError, match="missing 1 required positional argument"): + call_transform() + elif needs_packed_inputs and unpack: + with pytest.raises(TypeError, match="takes 2 positional arguments but 3 were given"): + call_transform() + else: + output = call_transform() + + assert isinstance(output, tuple) and len(output) == 2 + assert output[0] is image + assert output[1] is label + + def test_compose(self): + transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ) + + input = make_image() + + actual = check_transform(transform, input) + expected = F.vertical_flip(F.horizontal_flip(input)) + + assert_equal(actual, expected) + + @pytest.mark.parametrize("p", [0.0, 1.0]) + @pytest.mark.parametrize("sequence_type", [list, nn.ModuleList]) + def test_random_apply(self, p, sequence_type): + transform = transforms.RandomApply( + sequence_type( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ), + p=p, + ) + + # This needs to be a pure tensor (or a PIL image), because otherwise check_transforms skips the v1 compatibility + # check + input = make_image_tensor() + output = check_transform(transform, input, check_v1_compatibility=issubclass(sequence_type, nn.ModuleList)) + + if p == 1: + assert_equal(output, F.vertical_flip(F.horizontal_flip(input))) + else: + assert output is input + + @pytest.mark.parametrize("p", [(0, 1), (1, 0)]) + def test_random_choice(self, p): + transform = transforms.RandomChoice( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ], + p=p, + ) + + input = make_image() + output = check_transform(transform, input) + + p_horz, p_vert = p + if p_horz: + assert_equal(output, F.horizontal_flip(input)) + else: + assert_equal(output, F.vertical_flip(input)) + + def test_random_order(self): + transform = transforms.Compose( + [ + transforms.RandomHorizontalFlip(p=1), + transforms.RandomVerticalFlip(p=1), + ] + ) + + input = make_image() + + actual = check_transform(transform, input) + # We can't really check whether the transforms are actually applied in random order. However, horizontal and + # vertical flip are commutative. Meaning, even under the assumption that the transform applies them in random + # order, we can use a fixed order to compute the expected value. 
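# --- Editor's aside (standalone sketch, torch only): the commutativity the comment
# above relies on is easy to see on a raw tensor -- flipping along the width axis and
# then the height axis gives the same result as the reverse order, so the RandomOrder
# expectation can be computed with a fixed order.
import torch

x = torch.arange(12).reshape(1, 3, 4)  # a tiny (C, H, W) "image"
h_then_v = torch.flip(torch.flip(x, dims=[-1]), dims=[-2])  # hflip, then vflip
v_then_h = torch.flip(torch.flip(x, dims=[-2]), dims=[-1])  # vflip, then hflip
assert torch.equal(h_then_v, v_then_h)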
+ expected = F.vertical_flip(F.horizontal_flip(input)) + + assert_equal(actual, expected) + + def test_errors(self): + for cls in [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]: + with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): + cls(lambda x: x) + + for cls in ( + transforms.Compose, + transforms.RandomApply, + transforms.RandomChoice, + transforms.RandomOrder, + ): + + with pytest.raises(ValueError, match="at least one transform"): + cls([]) + + for p in [-1, 2]: + with pytest.raises(ValueError, match=re.escape("value in the interval [0.0, 1.0]")): + transforms.RandomApply([lambda x: x], p=p) + + for transforms_, p in [ + ([lambda x: x], []), + ( + [lambda x: x, lambda x: x], + [ + 1.0, + ], + ), + ]: + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice(transforms_, p=p) + + +class TestToDtype: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.to_dtype_image, make_image_tensor), + (F.to_dtype_image, make_image), + (F.to_dtype_video, make_video), + ], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, scale): + check_kernel( + kernel, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_functional(self, make_input, input_dtype, output_dtype, device, scale): + check_functional( + F.to_dtype, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + @pytest.mark.parametrize("as_dict", (True, False)) + def test_transform(self, make_input, input_dtype, output_dtype, device, scale, as_dict): + input = make_input(dtype=input_dtype, device=device) + if as_dict: + output_dtype = {type(input): output_dtype} + check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input, check_sample_input=not as_dict) + + def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False): + input_dtype = image.dtype + output_dtype = dtype + + if not scale: + return image.to(dtype) + + if output_dtype == input_dtype: + return image + + def fn(value): + if input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return round(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + 
output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return value / factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist())).to(dtype=output_dtype, device=image.device) + + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8, torch.uint16]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8, torch.uint16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_image_correctness(self, input_dtype, output_dtype, device, scale): + if input_dtype.is_floating_point and output_dtype == torch.int64: + pytest.xfail("float to int64 conversion is not supported") + if input_dtype == torch.uint8 and output_dtype == torch.uint16 and device == "cuda": + pytest.xfail("uint8 to uint16 conversion is not supported on cuda") + + input = make_image(dtype=input_dtype, device=device) + + out = F.to_dtype(input, dtype=output_dtype, scale=scale) + expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) + + if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: + torch.testing.assert_close(out, expected, atol=1, rtol=0) + else: + torch.testing.assert_close(out, expected) + + def was_scaled(self, inpt): + # this assumes the target dtype is float + return inpt.max() <= 1 + + def make_inpt_with_bbox_and_mask(self, make_input): + H, W = 10, 10 + inpt_dtype = torch.uint8 + bbox_dtype = torch.float32 + mask_dtype = torch.bool + sample = { + "inpt": make_input(size=(H, W), dtype=inpt_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), + "mask": make_detection_masks(size=(H, W), dtype=mask_dtype), + } + + return sample, inpt_dtype, bbox_dtype, mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + @pytest.mark.parametrize("scale", (True, False)) + def test_dtype_not_a_dict(self, make_input, scale): + # assert only inpt gets transformed when dtype isn't a dict + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype=torch.float32, scale=scale)(sample) + + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + if scale: + assert self.was_scaled(out["inpt"]) + else: + assert not self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_others_catch_all_and_none(self, make_input): + # make sure "others" works as a catch-all and that None means no conversion + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.int64, "others": None})(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_typical_use_case(self, make_input): + # Typical use-case: want to convert dtype and scale for inpt and just dtype for masks. 
+ # This just makes sure we now have a decent API for this + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype( + dtype={type(sample["inpt"]): torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True + )(sample) + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + assert self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_errors_warnings(self, make_input): + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + + with pytest.raises(ValueError, match="No dtype was specified for"): + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.float32})(sample) + with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): + transforms.ToDtype(dtype={torch.Tensor: torch.float32, tv_tensors.Image: torch.float32}) + with pytest.warns(UserWarning, match="no scaling will be done"): + out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + def test_uint16(self): + # These checks are probably already covered above but since uint16 is a + # newly supported dtype, we want to be extra careful, hence this + # explicit test + img_uint16 = torch.randint(0, 65535, (256, 512), dtype=torch.uint16) + + img_uint8 = F.to_dtype(img_uint16, torch.uint8, scale=True) + img_float32 = F.to_dtype(img_uint16, torch.float32, scale=True) + img_int32 = F.to_dtype(img_uint16, torch.int32, scale=True) + + assert_equal(img_uint8, (img_uint16 / 256).to(torch.uint8)) + assert_close(img_float32, (img_uint16 / 65535)) + + assert_close(F.to_dtype(img_float32, torch.uint16, scale=True), img_uint16, rtol=0, atol=1) + # Ideally we'd check against (img_uint16 & 0xFF00) but bitwise and isn't supported for it yet + # so we simulate it by scaling down and up again. 
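# --- Editor's aside (standalone sketch, torch only): the "scale down and up again"
# trick in the comment above works because, for values in the uint16 range, dividing
# by 256 and multiplying back keeps only the high byte, i.e. it equals x & 0xFF00.
# int64 is used here purely for the illustration, since bitwise and is the operation
# the comment says is not yet available for torch.uint16.
import torch

x = torch.randint(0, 65536, (1000,), dtype=torch.int64)
assert torch.equal((x // 256) * 256, x & 0xFF00)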
+ assert_equal(F.to_dtype(img_uint8, torch.uint16, scale=True), ((img_uint16 / 256).to(torch.uint16) * 256)) + assert_equal(F.to_dtype(img_int32, torch.uint16, scale=True), img_uint16) + + assert_equal(F.to_dtype(img_float32, torch.uint8, scale=True), img_uint8) + assert_close(F.to_dtype(img_uint8, torch.float32, scale=True), img_float32, rtol=0, atol=1e-2) + + +class TestAdjustBrightness: + _CORRECTNESS_BRIGHTNESS_FACTORS = [0.5, 0.0, 1.0, 5.0] + _DEFAULT_BRIGHTNESS_FACTOR = _CORRECTNESS_BRIGHTNESS_FACTORS[0] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.adjust_brightness_image, make_image), + (F.adjust_brightness_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_brightness, make_input(), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_brightness_image, torch.Tensor), + (F._color._adjust_brightness_image_pil, PIL.Image.Image), + (F.adjust_brightness_image, tv_tensors.Image), + (F.adjust_brightness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_brightness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("brightness_factor", _CORRECTNESS_BRIGHTNESS_FACTORS) + def test_image_correctness(self, brightness_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_brightness(image, brightness_factor=brightness_factor) + expected = F.to_image(F.adjust_brightness(F.to_pil_image(image), brightness_factor=brightness_factor)) + + torch.testing.assert_close(actual, expected) + + +class TestCutMixMixUp: + class DummyDataset: + def __init__(self, size, num_classes, one_hot_labels): + self.size = size + self.num_classes = num_classes + self.one_hot_labels = one_hot_labels + assert size < num_classes + + def __getitem__(self, idx): + img = torch.rand(3, 100, 100) + label = idx # This ensures all labels in a batch are unique and makes testing easier + if self.one_hot_labels: + label = torch.nn.functional.one_hot(torch.tensor(label), num_classes=self.num_classes) + return img, label + + def __len__(self): + return self.size + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + @pytest.mark.parametrize("one_hot_labels", (True, False)) + def test_supported_input_structure(self, T, one_hot_labels): + + batch_size = 32 + num_classes = 100 + + dataset = self.DummyDataset(size=batch_size, num_classes=num_classes, one_hot_labels=one_hot_labels) + + cutmix_mixup = T(num_classes=num_classes) + + dl = DataLoader(dataset, batch_size=batch_size) + + # Input sanity checks + img, target = next(iter(dl)) + input_img_size = img.shape[-3:] + assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor) + assert target.shape == (batch_size, num_classes) if one_hot_labels else (batch_size,) + + def check_output(img, target): + assert img.shape == (batch_size, *input_img_size) + assert target.shape == (batch_size, num_classes) + torch.testing.assert_close(target.sum(axis=-1), torch.ones(batch_size)) + num_non_zero_labels 
= (target != 0).sum(axis=-1) + assert (num_non_zero_labels == 2).all() + + # After Dataloader, as unpacked input + img, target = next(iter(dl)) + assert target.shape == (batch_size, num_classes) if one_hot_labels else (batch_size,) + img, target = cutmix_mixup(img, target) + check_output(img, target) + + # After Dataloader, as packed input + packed_from_dl = next(iter(dl)) + assert isinstance(packed_from_dl, list) + img, target = cutmix_mixup(packed_from_dl) + check_output(img, target) + + # As collation function. We expect default_collate to be used by users. + def collate_fn_1(batch): + return cutmix_mixup(default_collate(batch)) + + def collate_fn_2(batch): + return cutmix_mixup(*default_collate(batch)) + + for collate_fn in (collate_fn_1, collate_fn_2): + dl = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) + img, target = next(iter(dl)) + check_output(img, target) + + @needs_cuda + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_cpu_vs_gpu(self, T): + num_classes = 10 + batch_size = 3 + H, W = 12, 12 + + imgs = torch.rand(batch_size, 3, H, W) + labels = torch.randint(0, num_classes, (batch_size,)) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + _check_kernel_cuda_vs_cpu(cutmix_mixup, imgs, labels, rtol=None, atol=None) + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_error(self, T): + + num_classes = 10 + batch_size = 9 + + imgs = torch.rand(batch_size, 3, 12, 12) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + for input_with_bad_type in ( + F.to_pil_image(imgs[0]), + tv_tensors.Mask(torch.rand(12, 12)), + tv_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), + tv_tensors.KeyPoints(torch.rand(2, 2), canvas_size=(12, 12)), + ): + print(type(input_with_bad_type), cutmix_mixup) + with pytest.raises(ValueError, match="does not support PIL images, "): + cutmix_mixup(input_with_bad_type) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + cutmix_mixup({"img": imgs, "Nothing_else": 3}) + + with pytest.raises(ValueError, match="labels should be index based"): + # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label + # It's OK, it's an edge-case. 
The important thing is that this fails loudly instead of passing silently + cutmix_mixup(imgs) + + with pytest.raises(ValueError, match="When using the default labels_getter"): + cutmix_mixup(imgs, "not_a_tensor") + + with pytest.raises(ValueError, match="Expected a batched input with 4 dims"): + cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,))) + + with pytest.raises(ValueError, match="does not match the batch size of the labels"): + cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,))) + + with pytest.raises(ValueError, match="When passing 2D labels"): + wrong_num_classes = num_classes + 1 + T(alpha=0.5, num_classes=num_classes)(imgs, torch.randint(0, 2, size=(batch_size, wrong_num_classes))) + + with pytest.raises(ValueError, match="but got a tensor of shape"): + cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3, 4))) + + with pytest.raises(ValueError, match="num_classes must be passed"): + T(alpha=0.5)(imgs, torch.randint(0, num_classes, size=(batch_size,))) + + +@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) +@pytest.mark.parametrize("sample_type", (tuple, list, dict)) +def test_labels_getter_default_heuristic(key, sample_type): + labels = torch.arange(10) + sample = {key: labels, "another_key": "whatever"} + if sample_type is not dict: + sample = sample_type((None, sample, "whatever_again")) + assert transforms._utils._find_labels_default_heuristic(sample) is labels + + if key.lower() != "labels": + # If "labels" is in the dict (case-insensitive), + # it takes precedence over other keys which would otherwise be a match + d = {key: "something_else", "labels": labels} + assert transforms._utils._find_labels_default_heuristic(d) is labels + + +class TestShapeGetters: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_dimensions_image, make_image_tensor), + (F._meta._get_dimensions_image_pil, make_image_pil), + (F.get_dimensions_image, make_image), + (F.get_dimensions_video, make_video), + ], + ) + def test_get_dimensions(self, kernel, make_input): + size = (10, 10) + color_space, num_channels = "RGB", 3 + + input = make_input(size, color_space=color_space) + + assert kernel(input) == F.get_dimensions(input) == [num_channels, *size] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_channels_image, make_image_tensor), + (F._meta._get_num_channels_image_pil, make_image_pil), + (F.get_num_channels_image, make_image), + (F.get_num_channels_video, make_video), + ], + ) + def test_get_num_channels(self, kernel, make_input): + color_space, num_channels = "RGB", 3 + + input = make_input(color_space=color_space) + + assert kernel(input) == F.get_num_channels(input) == num_channels + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_size_image, make_image_tensor), + (F._meta._get_size_image_pil, make_image_pil), + (F.get_size_image, make_image), + (F.get_size_bounding_boxes, make_bounding_boxes), + (F.get_size_keypoints, make_keypoints), + (F.get_size_mask, make_detection_masks), + (F.get_size_mask, make_segmentation_mask), + (F.get_size_video, make_video), + ], + ) + def test_get_size(self, kernel, make_input): + size = (10, 10) + + input = make_input(size) + + assert kernel(input) == F.get_size(input) == list(size) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_frames_video, make_video_tensor), + (F.get_num_frames_video, make_video), + ], + ) + def test_get_num_frames(self, kernel, make_input): + 
num_frames = 4 + + input = make_input(num_frames=num_frames) + + assert kernel(input) == F.get_num_frames(input) == num_frames + + @pytest.mark.parametrize( + ("functional", "make_input"), + [ + (F.get_dimensions, make_bounding_boxes), + (F.get_dimensions, make_detection_masks), + (F.get_dimensions, make_segmentation_mask), + (F.get_num_channels, make_bounding_boxes), + (F.get_num_channels, make_detection_masks), + (F.get_num_channels, make_segmentation_mask), + (F.get_num_frames, make_image_pil), + (F.get_num_frames, make_image), + (F.get_num_frames, make_bounding_boxes), + (F.get_num_frames, make_detection_masks), + (F.get_num_frames, make_segmentation_mask), + ], + ) + def test_unsupported_types(self, functional, make_input): + input = make_input() + + with pytest.raises(TypeError, match=re.escape(str(type(input)))): + functional(input) + + +class TestRegisterKernel: + @pytest.mark.parametrize("functional", (F.resize, "resize")) + def test_register_kernel(self, functional): + class CustomTVTensor(tv_tensors.TVTensor): + pass + + kernel_was_called = False + + @F.register_kernel(functional, CustomTVTensor) + def new_resize(dp, *args, **kwargs): + nonlocal kernel_was_called + kernel_was_called = True + return dp + + t = transforms.Resize(size=(224, 224), antialias=True) + + my_dp = CustomTVTensor(torch.rand(3, 10, 10)) + out = t(my_dp) + assert out is my_dp + assert kernel_was_called + + # Sanity check to make sure we didn't override the kernel of other types + t(torch.rand(3, 10, 10)).shape == (3, 224, 224) + t(tv_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224) + + def test_errors(self): + with pytest.raises(ValueError, match="Could not find functional with name"): + F.register_kernel("bad_name", tv_tensors.Image) + + with pytest.raises(ValueError, match="Kernels can only be registered on functionals"): + F.register_kernel(tv_tensors.Image, F.resize) + + with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"): + F.register_kernel(F.resize, object) + + with pytest.raises(ValueError, match="cannot be registered for the builtin tv_tensor classes"): + F.register_kernel(F.resize, tv_tensors.Image)(F.resize_image) + + class CustomTVTensor(tv_tensors.TVTensor): + pass + + def resize_custom_tv_tensor(): + pass + + F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) + + with pytest.raises(ValueError, match="already has a kernel registered for type"): + F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor) + + +class TestGetKernel: + # We are using F.resize as functional and the kernels below as proxy. Any other functional / kernels combination + # would also be fine + KERNELS = { + torch.Tensor: F.resize_image, + PIL.Image.Image: F._geometry._resize_image_pil, + tv_tensors.Image: F.resize_image, + tv_tensors.BoundingBoxes: F.resize_bounding_boxes, + tv_tensors.Mask: F.resize_mask, + tv_tensors.Video: F.resize_video, + } + + @pytest.mark.parametrize("input_type", [str, int, object]) + def test_unsupported_types(self, input_type): + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, input_type) + + def test_exact_match(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check the exact matching afterwards. 
+ def resize_with_pure_kernels(): + pass + + for input_type, kernel in self.KERNELS.items(): + _register_kernel_internal(resize_with_pure_kernels, input_type, tv_tensor_wrapper=False)(kernel) + + assert _get_kernel(resize_with_pure_kernels, input_type) is kernel + + def test_builtin_tv_tensor_subclass(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check if subclasses of our builtin tv_tensors get dispatched + # to the kernel of the corresponding superclass + def resize_with_pure_kernels(): + pass + + class MyImage(tv_tensors.Image): + pass + + class MyBoundingBoxes(tv_tensors.BoundingBoxes): + pass + + class MyMask(tv_tensors.Mask): + pass + + class MyVideo(tv_tensors.Video): + pass + + for custom_tv_tensor_subclass in [ + MyImage, + MyBoundingBoxes, + MyMask, + MyVideo, + ]: + builtin_tv_tensor_class = custom_tv_tensor_subclass.__mro__[1] + builtin_tv_tensor_kernel = self.KERNELS[builtin_tv_tensor_class] + _register_kernel_internal(resize_with_pure_kernels, builtin_tv_tensor_class, tv_tensor_wrapper=False)( + builtin_tv_tensor_kernel + ) + + assert _get_kernel(resize_with_pure_kernels, custom_tv_tensor_subclass) is builtin_tv_tensor_kernel + + def test_tv_tensor_subclass(self): + class MyTVTensor(tv_tensors.TVTensor): + pass + + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, MyTVTensor) + + def resize_my_tv_tensor(): + pass + + _register_kernel_internal(F.resize, MyTVTensor, tv_tensor_wrapper=False)(resize_my_tv_tensor) + + assert _get_kernel(F.resize, MyTVTensor) is resize_my_tv_tensor + + def test_pil_image_subclass(self): + opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + loaded_image = opened_image.convert("RGB") + + # check the assumptions + assert isinstance(opened_image, PIL.Image.Image) + assert type(opened_image) is not PIL.Image.Image + + assert type(loaded_image) is PIL.Image.Image + + size = [17, 11] + for image in [opened_image, loaded_image]: + kernel = _get_kernel(F.resize, type(image)) + + output = kernel(image, size=size) + + assert F.get_size(output) == size + + +class TestPermuteChannels: + _DEFAULT_PERMUTATION = [2, 0, 1] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.permute_channels_image, make_image_tensor), + # FIXME + # check_kernel does not support PIL kernel, but it should + (F.permute_channels_image, make_image), + (F.permute_channels_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.permute_channels, make_input(), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.permute_channels_image, torch.Tensor), + (F._color._permute_channels_image_pil, PIL.Image.Image), + (F.permute_channels_image, tv_tensors.Image), + (F.permute_channels_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + 
check_functional_kernel_signature_match(F.permute_channels, kernel=kernel, input_type=input_type) + + def reference_image_correctness(self, image, permutation): + channel_images = image.split(1, dim=-3) + permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] + return tv_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) + + @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) + @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) + def test_image_correctness(self, permutation, batch_dims): + image = make_image(batch_dims=batch_dims) + + actual = F.permute_channels(image, permutation=permutation) + expected = self.reference_image_correctness(image, permutation=permutation) + + torch.testing.assert_close(actual, expected) + + +class TestElastic: + def _make_displacement(self, inpt): + return torch.rand( + 1, + *F.get_size(inpt), + 2, + dtype=torch.float32, + device=inpt.device if isinstance(inpt, torch.Tensor) else "cpu", + ) + + @param_value_parametrization( + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8, torch.float16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + image = make_image_tensor(dtype=dtype, device=device) + + check_kernel( + F.elastic_image, + image, + displacement=self._make_displacement(image), + **{param: value}, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + check_cuda_vs_cpu=dtype is not torch.float16, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.elastic_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + displacement=self._make_displacement(bounding_boxes), + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + + check_kernel( + F.elastic_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + displacement=self._make_displacement(keypoints), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + mask = make_mask() + check_kernel(F.elastic_mask, mask, displacement=self._make_displacement(mask)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.elastic_video, video, displacement=self._make_displacement(video)) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + input = make_input() + check_functional(F.elastic, input, displacement=self._make_displacement(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.elastic_image, 
torch.Tensor), + (F._geometry._elastic_image_pil, PIL.Image.Image), + (F.elastic_image, tv_tensors.Image), + (F.elastic_mask, tv_tensors.Mask), + (F.elastic_video, tv_tensors.Video), + (F.elastic_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.elastic, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_displacement_error(self, make_input): + input = make_input() + + with pytest.raises(TypeError, match="displacement should be a Tensor"): + F.elastic(input, displacement=None) + + with pytest.raises(ValueError, match="displacement shape should be"): + F.elastic(input, displacement=torch.rand(F.get_size(input))) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, size, device): + # We have to skip that test on M1 because it's flaky: Mismatched elements: 35 / 89205 (0.0%) + # See https://github.com/pytorch/vision/issues/8154 + # All other platforms are fine, so the differences do not come from something we own in torchvision + check_v1_compatibility = False if sys.platform == "darwin" else dict(rtol=0, atol=1) + + check_transform( + transforms.ElasticTransform(), + make_input(size, device=device), + check_v1_compatibility=check_v1_compatibility, + ) + + +class TestToPureTensor: + def test_correctness(self): + input = { + "img": make_image(), + "img_tensor": make_image_tensor(), + "img_pil": make_image_pil(), + "mask": make_detection_masks(), + "video": make_video(), + "bbox": make_bounding_boxes(), + "str": "str", + } + + out = transforms.ToPureTensor()(input) + + for input_value, out_value in zip(input.values(), out.values()): + if isinstance(input_value, tv_tensors.TVTensor): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, tv_tensors.TVTensor) + else: + assert isinstance(out_value, type(input_value)) + + +class TestCrop: + INPUT_SIZE = (21, 11) + + CORRECTNESS_CROP_KWARGS = [ + # center + dict(top=5, left=5, height=10, width=5), + # larger than input, i.e. 
pad + dict(top=-5, left=-5, height=30, width=20), + # sides: left, right, top, bottom + dict(top=-5, left=-5, height=30, width=10), + dict(top=-5, left=5, height=30, width=10), + dict(top=-5, left=-5, height=20, width=20), + dict(top=5, left=-5, height=20, width=20), + # corners: top-left, top-right, bottom-left, bottom-right + dict(top=-5, left=-5, height=20, width=10), + dict(top=-5, left=5, height=20, width=10), + dict(top=5, left=-5, height=20, width=10), + dict(top=5, left=5, height=20, width=10), + ] + MINIMAL_CROP_KWARGS = CORRECTNESS_CROP_KWARGS[0] + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, kwargs, dtype, device): + check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, kwargs, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + check_kernel(F.crop_bounding_boxes, bounding_boxes, format=format, **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_keypoints(self, kwargs, dtype, device): + keypoints = make_keypoints(self.INPUT_SIZE, dtype=dtype, device=device) + check_kernel(F.crop_keypoints, keypoints, **kwargs) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.crop_mask, make_mask(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + def test_kernel_video(self): + check_kernel(F.crop_video, make_video(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.crop, make_input(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.crop_image, torch.Tensor), + (F._geometry._crop_image_pil, PIL.Image.Image), + (F.crop_image, tv_tensors.Image), + (F.crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.crop_mask, tv_tensors.Mask), + (F.crop_video, tv_tensors.Video), + (F.crop_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + def test_functional_image_correctness(self, kwargs): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = F.crop(image, **kwargs) + expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs)) + + assert_equal(actual, expected) + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize( + 
"make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, param, value, make_input): + input = make_input(self.INPUT_SIZE) + + check_sample_input = True + if param == "fill": + if isinstance(value, (tuple, list)): + if isinstance(input, tv_tensors.Mask): + pytest.skip("F.pad_mask doesn't support non-scalar fill.") + else: + check_sample_input = False + + kwargs = dict( + # 1. size is required + # 2. the fill parameter only has an affect if we need padding + size=[s + 4 for s in self.INPUT_SIZE], + fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8), + ) + else: + kwargs = {param: value} + + check_transform( + transforms.RandomCrop(**kwargs, pad_if_needed=True), + input, + check_v1_compatibility=param != "fill" or isinstance(value, (int, float)), + check_sample_input=check_sample_input, + ) + + @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)]) + def test_transform_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @pytest.mark.parametrize("padding", [None, 1, (1, 1), (1, 1, 1, 1)]) + def test_transform_insufficient_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 3 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + with pytest.raises(ValueError, match="larger than (padded )?input image size"): + transform(inpt) + + def test_transform_pad_if_needed(self): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s * 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, pad_if_needed=True) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=CORRECTNESS_FILLS, + padding_mode=["constant", "edge", "reflect", "symmetric"], + ) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, seed): + kwargs = {param: value} + if param != "size": + # 1. size is required + # 2. 
the fill / padding_mode parameters only have an affect if we need padding + kwargs["size"] = [s + 4 for s in self.INPUT_SIZE] + if param == "fill": + kwargs["fill"] = adapt_fill(kwargs["fill"], dtype=torch.uint8) + + transform = transforms.RandomCrop(pad_if_needed=True, **kwargs) + + image = make_image(self.INPUT_SIZE) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + assert_equal(actual, expected) + + def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = F.crop(bounding_boxes, **kwargs) + expected = self._reference_crop_bounding_boxes(bounding_boxes, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, output_size, format, dtype, device, seed): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + input_size = [s * 2 for s in output_size] + bounding_boxes = make_bounding_boxes(input_size, format=format, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform.make_params([bounding_boxes]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_crop_bounding_boxes(bounding_boxes, **params) + + torch.testing.assert_close(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def _reference_crop_keypoints(self, keypoints, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_keypoints_helper( + keypoints, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_keypoints_correctness(self, kwargs, dtype, device): + keypoints = 
make_keypoints(self.INPUT_SIZE, dtype=dtype, device=device) + + actual = F.crop(keypoints, **kwargs) + expected = self._reference_crop_keypoints(keypoints, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_keypoints_correctness(self, output_size, dtype, device, seed): + input_size = (output_size[0] * 2, output_size[1] * 2) + keypoints = make_keypoints(input_size, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform.make_params([keypoints]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(keypoints) + + expected = self._reference_crop_keypoints(keypoints, **params) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_errors(self): + with pytest.raises(ValueError, match="Please provide only two dimensions"): + transforms.RandomCrop([10, 12, 14]) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding="abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=0.5) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[0.5, 0.5]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomCrop([10, 12], padding=1, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") + + +class TestErase: + INPUT_SIZE = (17, 11) + FUNCTIONAL_KWARGS = dict( + zip("ijhwv", [2, 2, 10, 8, torch.tensor(0.0, dtype=torch.float32, device="cpu").reshape(-1, 1, 1)]) + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.erase_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, dtype, device): + input = make_image(self.INPUT_SIZE, dtype=dtype, device=device) + input_version = input._version + + output_out_of_place = F.erase_image(input, **self.FUNCTIONAL_KWARGS) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.erase_image(input, **self.FUNCTIONAL_KWARGS, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.erase_video, make_video(self.INPUT_SIZE), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, 
make_video], + ) + def test_functional(self, make_input): + check_functional(F.erase, make_input(), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.erase_image, torch.Tensor), + (F._augment._erase_image_pil, PIL.Image.Image), + (F.erase_image, tv_tensors.Image), + (F.erase_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.erase, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + input = make_input(device=device) + + with pytest.warns(UserWarning, match="currently passing through inputs of type"): + check_transform( + transforms.RandomErasing(p=1), + input, + check_v1_compatibility=not isinstance(input, PIL.Image.Image), + ) + + def _reference_erase_image(self, image, *, i, j, h, w, v): + mask = torch.zeros_like(image, dtype=torch.bool) + mask[..., i : i + h, j : j + w] = True + + # The broadcasting and type casting logic is handled automagically in the kernel through indexing + value = torch.broadcast_to(v, (*image.shape[:-2], h, w)).to(image) + + erased_image = torch.empty_like(image) + erased_image[mask] = value.flatten() + erased_image[~mask] = image[~mask] + + return erased_image + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dtype, device): + image = make_image(dtype=dtype, device=device) + + actual = F.erase(image, **self.FUNCTIONAL_KWARGS) + expected = self._reference_erase_image(image, **self.FUNCTIONAL_KWARGS) + + assert_equal(actual, expected) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + value=[0, 0.5, (0, 1, 0), [-0.2, 0.0, 1.3], "random"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, dtype, device, seed): + transform = transforms.RandomErasing(**{param: value}, p=1) + + image = make_image(dtype=dtype, device=device) + + with freeze_rng_state(): + torch.manual_seed(seed) + # This emulates the random apply check that happens before make_params is called + torch.rand(1) + params = transform.make_params([image]) + + torch.manual_seed(seed) + actual = transform(image) + + expected = self._reference_erase_image(image, **params) + + assert_equal(actual, expected) + + def test_transform_errors(self): + with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): + transforms.RandomErasing(value={}) + + with pytest.raises(ValueError, match="If value is str, it should be 'random'"): + transforms.RandomErasing(value="abc") + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomErasing(scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomErasing(ratio=123) + + with pytest.raises(ValueError, match="Scale should be between 0 and 1"): + transforms.RandomErasing(scale=[-1, 2]) + + transform = transforms.RandomErasing(value=[1, 2, 3, 4]) + + with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): + transform.make_params([make_image()]) + 
+ +class TestGaussianBlur: + @pytest.mark.parametrize("kernel_size", [1, 3, (3, 1), [3, 5]]) + @pytest.mark.parametrize("sigma", [None, 1.0, 1, (0.5,), [0.3], (0.3, 0.7), [0.9, 0.2]]) + def test_kernel_image(self, kernel_size, sigma): + check_kernel( + F.gaussian_blur_image, + make_image(), + kernel_size=kernel_size, + sigma=sigma, + check_scripted_vs_eager=not (isinstance(kernel_size, int) or isinstance(sigma, (float, int))), + ) + + def test_kernel_image_errors(self): + image = make_image_tensor() + + with pytest.raises(ValueError, match="kernel_size is a sequence its length should be 2"): + F.gaussian_blur_image(image, kernel_size=[1, 2, 3]) + + for kernel_size in [2, -1]: + with pytest.raises(ValueError, match="kernel_size should have odd and positive integers"): + F.gaussian_blur_image(image, kernel_size=kernel_size) + + with pytest.raises(ValueError, match="sigma is a sequence, its length should be 2"): + F.gaussian_blur_image(image, kernel_size=1, sigma=[1, 2, 3]) + + with pytest.raises(TypeError, match="sigma should be either float or sequence of floats"): + F.gaussian_blur_image(image, kernel_size=1, sigma=object()) + + with pytest.raises(ValueError, match="sigma should have positive values"): + F.gaussian_blur_image(image, kernel_size=1, sigma=-1) + + def test_kernel_video(self): + check_kernel(F.gaussian_blur_video, make_video(), kernel_size=(3, 3)) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.gaussian_blur, make_input(), kernel_size=(3, 3)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.gaussian_blur_image, torch.Tensor), + (F._misc._gaussian_blur_image_pil, PIL.Image.Image), + (F.gaussian_blur_image, tv_tensors.Image), + (F.gaussian_blur_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.gaussian_blur, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("sigma", [5, 2.0, (0.5, 2), [1.3, 2.7]]) + def test_transform(self, make_input, device, sigma): + check_transform(transforms.GaussianBlur(kernel_size=3, sigma=sigma), make_input(device=device)) + + def test_assertions(self): + with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): + transforms.GaussianBlur([10, 12, 14]) + + with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): + transforms.GaussianBlur(4) + + with pytest.raises(ValueError, match="If sigma is a sequence its length should be 1 or 2. 
Got 3"): + transforms.GaussianBlur(3, sigma=[1, 2, 3]) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=-1.0) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=[2.0, 1.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.GaussianBlur(3, sigma={}) + + @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0], (10, 12.0), [10]]) + def test_make_params(self, sigma): + transform = transforms.GaussianBlur(3, sigma=sigma) + params = transform.make_params([]) + + if isinstance(sigma, float): + assert params["sigma"][0] == params["sigma"][1] == sigma + elif isinstance(sigma, list) and len(sigma) == 1: + assert params["sigma"][0] == params["sigma"][1] == sigma[0] + else: + assert sigma[0] <= params["sigma"][0] <= sigma[1] + assert sigma[0] <= params["sigma"][1] <= sigma[1] + + # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) + # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) + # { + # "10_12_3__3_3_0.8": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8), + # "10_12_3__3_3_0.5": cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5), + # "10_12_3__3_5_0.8": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8), + # "10_12_3__3_5_0.5": cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5), + # "26_28_1__23_23_1.7": cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7), + # } + REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS = torch.load( + Path(__file__).parent / "assets" / "gaussian_blur_opencv_results.pt", + weights_only=False, + ) + + @pytest.mark.parametrize( + ("dimensions", "kernel_size", "sigma"), + [ + ((3, 10, 12), (3, 3), 0.8), + ((3, 10, 12), (3, 3), 0.5), + ((3, 10, 12), (3, 5), 0.8), + ((3, 10, 12), (3, 5), 0.5), + ((1, 26, 28), (23, 23), 1.7), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float64, torch.float16]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dimensions, kernel_size, sigma, dtype, device): + if dtype is torch.float16 and device == "cpu": + pytest.skip("The CPU implementation of float16 on CPU differs from opencv") + + num_channels, height, width = dimensions + + reference_results_key = f"{height}_{width}_{num_channels}__{kernel_size[0]}_{kernel_size[1]}_{sigma}" + expected = ( + torch.tensor(self.REFERENCE_GAUSSIAN_BLUR_IMAGE_RESULTS[reference_results_key]) + .reshape(height, width, num_channels) + .permute(2, 0, 1) + .to(dtype=dtype, device=device) + ) + + image = tv_tensors.Image( + torch.arange(num_channels * height * width, dtype=torch.uint8) + .reshape(height, width, num_channels) + .permute(2, 0, 1), + dtype=dtype, + device=device, + ) + + actual = F.gaussian_blur_image(image, kernel_size=kernel_size, sigma=sigma) + + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + +class TestGaussianNoise: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_kernel(self, make_input): + check_kernel( + F.gaussian_noise, + make_input(dtype=torch.float32), + # This cannot pass because the noise on a batch in not per-image + check_batched_vs_unbatched=False, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.gaussian_noise, make_input(dtype=torch.float32)) + + @pytest.mark.parametrize( + ("kernel", 
"input_type"), + [ + (F.gaussian_noise, torch.Tensor), + (F.gaussian_noise_image, tv_tensors.Image), + (F.gaussian_noise_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.gaussian_noise, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_transform(self, make_input): + def adapter(_, input, __): + # This transform doesn't support uint8 so we have to convert the auto-generated uint8 tensors to float32 + # Same for PIL images + for key, value in input.items(): + if isinstance(value, torch.Tensor) and not value.is_floating_point(): + input[key] = value.to(torch.float32) + if isinstance(value, PIL.Image.Image): + input[key] = F.pil_to_tensor(value).to(torch.float32) + return input + + check_transform(transforms.GaussianNoise(), make_input(dtype=torch.float32), check_sample_input=adapter) + + def test_bad_input(self): + with pytest.raises(ValueError, match="Gaussian Noise is not implemented for PIL images."): + F.gaussian_noise(make_image_pil()) + with pytest.raises(ValueError, match="Input tensor is expected to be in float dtype"): + F.gaussian_noise(make_image(dtype=torch.uint8)) + with pytest.raises(ValueError, match="sigma shouldn't be negative"): + F.gaussian_noise(make_image(dtype=torch.float32), sigma=-1) + + def test_clip(self): + img = make_image(dtype=torch.float32) + + out = F.gaussian_noise(img, mean=100, clip=False) + assert out.min() > 50 + + out = F.gaussian_noise(img, mean=100, clip=True) + assert (out == 1).all() + + out = F.gaussian_noise(img, mean=-100, clip=False) + assert out.min() < -50 + + out = F.gaussian_noise(img, mean=-100, clip=True) + assert (out == 0).all() + + +class TestAutoAugmentTransforms: + # These transforms have a lot of branches in their `forward()` passes which are conditioned on random sampling. + # It's typically very hard to test the effect on some parameters without heavy mocking logic. + # This class adds correctness tests for the kernels that are specific to those transforms. The rest of kernels, e.g. + # rotate, are tested in their respective classes. The rest of the tests here are mostly smoke tests. + + def _reference_shear_translate(self, image, *, transform_id, magnitude, interpolation, fill): + if isinstance(image, PIL.Image.Image): + input = image + else: + input = F.to_pil_image(image) + + matrix = { + "ShearX": (1, magnitude, 0, 0, 1, 0), + "ShearY": (1, 0, 0, magnitude, 1, 0), + "TranslateX": (1, 0, -int(magnitude), 0, 1, 0), + "TranslateY": (1, 0, 0, 0, 1, -int(magnitude)), + }[transform_id] + + output = input.transform( + input.size, PIL.Image.AFFINE, matrix, resample=pil_modes_mapping[interpolation], fill=fill + ) + + if isinstance(image, PIL.Image.Image): + return output + else: + return F.to_image(output) + + @pytest.mark.parametrize("transform_id", ["ShearX", "ShearY", "TranslateX", "TranslateY"]) + @pytest.mark.parametrize("magnitude", [0.3, -0.2, 0.0]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + @pytest.mark.parametrize("input_type", ["Tensor", "PIL"]) + def test_correctness_shear_translate(self, transform_id, magnitude, interpolation, fill, input_type): + # ShearX/Y and TranslateX/Y are the only ops that are native to the AA transforms. 
They are modeled after the + # reference implementation: + # https://github.com/tensorflow/models/blob/885fda091c46c59d6c7bb5c7e760935eacc229da/research/autoaugment/augmentation_transforms.py#L273-L362 + # All other ops are checked in their respective dedicated tests. + + image = make_image(dtype=torch.uint8, device="cpu") + if input_type == "PIL": + image = F.to_pil_image(image) + + if "Translate" in transform_id: + # For TranslateX/Y magnitude is a value in pixels + magnitude *= min(F.get_size(image)) + + actual = transforms.AutoAugment()._apply_image_or_video_transform( + image, + transform_id=transform_id, + magnitude=magnitude, + interpolation=interpolation, + fill={type(image): fill}, + ) + expected = self._reference_shear_translate( + image, transform_id=transform_id, magnitude=magnitude, interpolation=interpolation, fill=fill + ) + + if input_type == "PIL": + actual, expected = F.to_image(actual), F.to_image(expected) + + if "Shear" in transform_id and input_type == "Tensor": + mae = (actual.float() - expected.float()).abs().mean() + assert mae < (12 if interpolation is transforms.InterpolationMode.NEAREST else 5) + else: + assert_close(actual, expected, rtol=0, atol=1) + + def _sample_input_adapter(self, transform, input, device): + adapted_input = {} + image_or_video_found = False + for key, value in input.items(): + if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints, tv_tensors.Mask)): + # AA transforms don't support bounding boxes or masks + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)): + if image_or_video_found: + # AA transforms only support a single image or video + continue + image_or_video_found = True + adapted_input[key] = value + return adapted_input + + @pytest.mark.parametrize( + "transform", + [transforms.AutoAugment(), transforms.RandAugment(), transforms.TrivialAugmentWide(), transforms.AugMix()], + ) + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_smoke(self, transform, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + input = make_input(dtype=dtype, device=device) + + with freeze_rng_state(): + # By default every test starts from the same random seed. This leads to minimal coverage of the sampling + # that happens inside forward(). To avoid calling the transform multiple times to achieve higher coverage, + # we build a reproducible random seed from the input type, dtype, and device. + torch.manual_seed(hash((make_input, dtype, device))) + + # For v2, we changed the random sampling of the AA transforms. This makes it impossible to compare the v1 + # and v2 outputs without complicated mocking and monkeypatching. Thus, we skip the v1 compatibility checks + # here and only check if we can script the v2 transform and subsequently call the result. 
+ check_transform( + transform, input, check_v1_compatibility=False, check_sample_input=self._sample_input_adapter + ) + + if type(input) is torch.Tensor and dtype is torch.uint8: + _script(transform)(input) + + def test_auto_augment_policy_error(self): + with pytest.raises(ValueError, match="provided policy"): + transforms.AutoAugment(policy=None) + + @pytest.mark.parametrize("severity", [0, 11]) + def test_aug_mix_severity_error(self, severity): + with pytest.raises(ValueError, match="severity must be between"): + transforms.AugMix(severity=severity) + + @pytest.mark.parametrize("num_ops", [-1, 1.1]) + def test_rand_augment_num_ops_error(self, num_ops): + with pytest.raises( + ValueError, + match=re.escape(f"num_ops should be a non-negative integer, but got {num_ops} instead."), + ): + transforms.RandAugment(num_ops=num_ops) + + +class TestConvertBoundingBoxFormat: + old_new_formats = list( + itertools.permutations( + [f for f in tv_tensors.BoundingBoxFormat if not tv_tensors.is_rotated_bounding_format(f)], 2 + ) + ) + old_new_formats += list( + itertools.permutations([f for f in tv_tensors.BoundingBoxFormat if tv_tensors.is_rotated_bounding_format(f)], 2) + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_kernel(self, old_format, new_format): + check_kernel( + F.convert_bounding_box_format, + make_bounding_boxes(format=old_format), + new_format=new_format, + old_format=old_format, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("inplace", [False, True]) + def test_kernel_noop(self, format, inplace): + input = make_bounding_boxes(format=format).as_subclass(torch.Tensor) + input_version = input._version + + output = F.convert_bounding_box_format(input, old_format=format, new_format=format, inplace=inplace) + + assert output is input + assert output.data_ptr() == input.data_ptr() + assert output._version == input_version + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_kernel_inplace(self, old_format, new_format): + input = make_bounding_boxes(format=old_format).as_subclass(torch.Tensor) + input_version = input._version + + output_out_of_place = F.convert_bounding_box_format(input, old_format=old_format, new_format=new_format) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.convert_bounding_box_format( + input, old_format=old_format, new_format=new_format, inplace=True + ) + if old_format != tv_tensors.BoundingBoxFormat.XYXYXYXY and new_format != tv_tensors.BoundingBoxFormat.XYXYXYXY: + # NOTE: BoundingBox format conversion from and to XYXYXYXY format + # cannot modify the input tensor inplace as it requires a dimension + # change. 
+ assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_functional(self, old_format, new_format): + check_functional(F.convert_bounding_box_format, make_bounding_boxes(format=old_format), new_format=new_format) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + @pytest.mark.parametrize("format_type", ["enum", "str"]) + def test_transform(self, old_format, new_format, format_type): + check_transform( + transforms.ConvertBoundingBoxFormat(new_format.name if format_type == "str" else new_format), + make_bounding_boxes(format=old_format), + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + def test_strings(self, old_format, new_format): + # Non-regression test for https://github.com/pytorch/vision/issues/8258 + input = make_bounding_boxes(format=old_format, canvas_size=(50, 50)) + expected = self._reference_convert_bounding_box_format(input, new_format) + + old_format = old_format.name + new_format = new_format.name + + out_functional = F.convert_bounding_box_format(input, new_format=new_format) + out_functional_tensor = F.convert_bounding_box_format( + input.as_subclass(torch.Tensor), old_format=old_format, new_format=new_format + ) + out_transform = transforms.ConvertBoundingBoxFormat(new_format)(input) + for out in (out_functional, out_functional_tensor, out_transform): + torch.testing.assert_close(out, expected) + + def _reference_convert_bounding_box_format(self, bounding_boxes, new_format): + return tv_tensors.wrap( + torchvision.ops.box_convert( + bounding_boxes.as_subclass(torch.Tensor), + in_fmt=bounding_boxes.format.name.lower(), + out_fmt=new_format.name.lower(), + ).to(bounding_boxes.dtype), + like=bounding_boxes, + format=new_format, + ) + + @pytest.mark.parametrize(("old_format", "new_format"), old_new_formats) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn_type", ["functional", "transform"]) + def test_correctness(self, old_format, new_format, dtype, device, fn_type): + if not dtype.is_floating_point and ( + tv_tensors.is_rotated_bounding_format(old_format) or tv_tensors.is_rotated_bounding_format(new_format) + ): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=old_format, dtype=dtype, device=device) + + if fn_type == "functional": + fn = functools.partial(F.convert_bounding_box_format, new_format=new_format) + else: + fn = transforms.ConvertBoundingBoxFormat(format=new_format) + + actual = fn(bounding_boxes) + expected = self._reference_convert_bounding_box_format(bounding_boxes, new_format) + + torch.testing.assert_close(actual, expected) + + def test_errors(self): + input_tv_tensor = make_bounding_boxes() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + + for input in [input_tv_tensor, input_pure_tensor]: + with pytest.raises(TypeError, match="missing 1 required argument: 'new_format'"): + F.convert_bounding_box_format(input) + + with pytest.raises(ValueError, match="`old_format` has to be passed"): + F.convert_bounding_box_format(input_pure_tensor, new_format=input_tv_tensor.format) + + with pytest.raises(ValueError, match="`old_format` must not be passed"): + F.convert_bounding_box_format( + input_tv_tensor, 
old_format=input_tv_tensor.format, new_format=input_tv_tensor.format + ) + + +class TestResizedCrop: + INPUT_SIZE = (17, 11) + CROP_KWARGS = dict(top=2, left=2, height=5, width=7) + OUTPUT_SIZE = (19, 32) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.resized_crop_image, make_image), + (F.resized_crop_bounding_boxes, make_bounding_boxes), + (F.resized_crop_mask, make_segmentation_mask), + (F.resized_crop_mask, make_detection_masks), + (F.resized_crop_video, make_video), + (F.resized_crop_keypoints, make_keypoints), + ], + ) + def test_kernel(self, kernel, make_input): + input = make_input(self.INPUT_SIZE) + if isinstance(input, tv_tensors.BoundingBoxes): + extra_kwargs = dict(format=input.format) + elif isinstance(input, (tv_tensors.Mask, tv_tensors.KeyPoints)): + extra_kwargs = dict() + else: + extra_kwargs = dict(antialias=True) + + check_kernel(kernel, input, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, **extra_kwargs) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional( + F.resized_crop, make_input(self.INPUT_SIZE), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, antialias=True + ) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.resized_crop_image, torch.Tensor), + (F._geometry._resized_crop_image_pil, PIL.Image.Image), + (F.resized_crop_image, tv_tensors.Image), + (F.resized_crop_mask, tv_tensors.Mask), + (F.resized_crop_video, tv_tensors.Video), + (F.resized_crop_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.resized_crop, kernel=kernel, input_type=input_type) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, param, value, make_input): + check_transform( + transforms.RandomResizedCrop(size=self.OUTPUT_SIZE, **{param: value}, antialias=True), + make_input(self.INPUT_SIZE), + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2. 
+ # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + def test_functional_image_correctness(self, interpolation): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = F.resized_crop( + image, **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation, antialias=True + ) + expected = F.to_image( + F.resized_crop( + F.to_pil_image(image), **self.CROP_KWARGS, size=self.OUTPUT_SIZE, interpolation=interpolation + ) + ) + + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resized_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width, size): + new_height, new_width = size + + crop_affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + [0, 0, 1], + ], + ) + resize_affine_matrix = np.array( + [ + [new_width / width, 0, 0], + [0, new_height / height, 0], + [0, 0, 1], + ], + ) + + affine_matrix = (resize_affine_matrix @ crop_affine_matrix)[:2, :] + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=size, clamp=False) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_functional_bounding_boxes_correctness(self, format): + # Note that we don't want to clamp because in + # _reference_resized_crop_bounding_boxes we are fusing the crop and the + # resize operation, where none of the croppings happen - particularly, + # the intermediate one. + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, clamping_mode=None) + + actual = F.resized_crop(bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + expected = self._reference_resized_crop_bounding_boxes( + bounding_boxes, **self.CROP_KWARGS, size=self.OUTPUT_SIZE + ) + + torch.testing.assert_close(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def _reference_resized_crop_keypoints(self, keypoints, *, top, left, height, width, size): + new_height, new_width = size + + crop_affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + [0, 0, 1], + ], + ) + resize_affine_matrix = np.array( + [ + [new_width / width, 0, 0], + [0, new_height / height, 0], + [0, 0, 1], + ], + ) + intermediate_keypoints = reference_affine_keypoints_helper( + keypoints, + affine_matrix=crop_affine_matrix, + new_canvas_size=(height, width), + ) + return reference_affine_keypoints_helper( + intermediate_keypoints, + affine_matrix=resize_affine_matrix, + new_canvas_size=size, + ) + + def test_functional_keypoints_correctness(self): + keypoints = make_keypoints(self.INPUT_SIZE) + + actual = F.resized_crop(keypoints, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + expected = self._reference_resized_crop_keypoints(keypoints, **self.CROP_KWARGS, size=self.OUTPUT_SIZE) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_transform_errors_warnings(self): + with pytest.raises(ValueError, match="provide only two dimensions"): + transforms.RandomResizedCrop(size=(1, 2, 3)) + + with pytest.raises(TypeError, match="Scale should be a sequence of two floats."): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence of two floats."): + 
transforms.RandomResizedCrop(size=self.INPUT_SIZE, ratio=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence of two floats."): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, ratio=[1, 2, 3]) + + with pytest.raises(TypeError, match="Scale should be a sequence of two floats."): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, scale=[1, 2, 3]) + + for param in ["scale", "ratio"]: + with pytest.warns(match="Scale and ratio should be of kind"): + transforms.RandomResizedCrop(size=self.INPUT_SIZE, **{param: [1, 0]}) + + +class TestPad: + EXHAUSTIVE_TYPE_PADDINGS = [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + CORRECTNESS_PADDINGS = [ + padding + for padding in EXHAUSTIVE_TYPE_PADDINGS + if isinstance(padding, int) or isinstance(padding, list) and len(padding) > 1 + ] + PADDING_MODES = ["constant", "symmetric", "edge", "reflect"] + + @param_value_parametrization( + padding=EXHAUSTIVE_TYPE_PADDINGS, + fill=EXHAUSTIVE_TYPE_FILLS, + padding_mode=PADDING_MODES, + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + kwargs = {param: value} + if param != "padding": + kwargs["padding"] = [1] + + image = make_image(dtype=dtype, device=device) + + check_kernel( + F.pad_image, + image, + **kwargs, + check_scripted_vs_eager=not ( + (param == "padding" and isinstance(value, int)) + # See https://github.com/pytorch/vision/pull/7252#issue-1585585521 for details + or ( + param == "fill" + and ( + isinstance(value, tuple) or (isinstance(value, list) and any(isinstance(v, int) for v in value)) + ) + ) + ), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, format): + bounding_boxes = make_bounding_boxes(format=format) + check_kernel( + F.pad_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + padding=[1], + ) + + @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"]) + def test_kernel_bounding_boxes_errors(self, padding_mode): + bounding_boxes = make_bounding_boxes() + with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"): + F.pad_bounding_boxes( + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + padding=[1], + padding_mode=padding_mode, + ) + + def test_kernel_keypoints(self): + keypoints = make_keypoints() + check_kernel( + F.pad_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + padding=[1], + ) + + @pytest.mark.parametrize("padding_mode", ["symmetric", "edge", "reflect"]) + def test_kernel_keypoints_errors(self, padding_mode): + keypoints = make_keypoints() + with pytest.raises(ValueError, match=f"'{padding_mode}' is not supported"): + F.pad_keypoints( + keypoints, + canvas_size=keypoints.canvas_size, + padding=[1], + padding_mode=padding_mode, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.pad_mask, make_mask(), padding=[1]) + + @pytest.mark.parametrize("fill", [[1], (0,), [1, 0, 1], (0, 1, 0)]) + def test_kernel_mask_errors(self, fill): + with pytest.raises(ValueError, match="Non-scalar fill value is not supported"): + F.pad_mask(make_segmentation_mask(), padding=[1], fill=fill) + + def test_kernel_video(self): + check_kernel(F.pad_video, make_video(), 
padding=[1]) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.pad, make_input(), padding=[1]) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.pad_image, torch.Tensor), + # The PIL kernel uses fill=0 as default rather than fill=None as all others. + # Since the whole fill story is already really inconsistent, we won't introduce yet another case to allow + # for this test to pass. + # See https://github.com/pytorch/vision/issues/6623 for a discussion. + # (F._geometry._pad_image_pil, PIL.Image.Image), + (F.pad_image, tv_tensors.Image), + (F.pad_bounding_boxes, tv_tensors.BoundingBoxes), + (F.pad_mask, tv_tensors.Mask), + (F.pad_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.pad, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.Pad(padding=[1]), make_input()) + + def test_transform_errors(self): + with pytest.raises(ValueError, match="Padding must be"): + transforms.Pad("abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad([-0.7, 0, 0.7]) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad(0.5) + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4 element of tuple or list"): + transforms.Pad(padding=[0.5, 0.5]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.Pad(12, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.Pad(12, padding_mode="abc") + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + @pytest.mark.parametrize( + ("padding_mode", "fill"), + [ + *[("constant", fill) for fill in CORRECTNESS_FILLS], + *[(padding_mode, None) for padding_mode in ["symmetric", "edge", "reflect"]], + ], + ) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_image_correctness(self, padding, padding_mode, fill, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = fn(image, padding=padding, padding_mode=padding_mode, fill=fill) + expected = F.to_image(F.pad(F.to_pil_image(image), padding=padding, padding_mode=padding_mode, fill=fill)) + + assert_equal(actual, expected) + + def _reference_pad_bounding_boxes(self, bounding_boxes, *, padding): + if isinstance(padding, int): + padding = [padding] + left, top, right, bottom = padding * (4 // len(padding)) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + ) + + height = bounding_boxes.canvas_size[0] + top + bottom + width = bounding_boxes.canvas_size[1] + left + right + + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width)) + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + 
@pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_bounding_boxes_correctness(self, padding, format, dtype, device, fn): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + actual = fn(bounding_boxes, padding=padding) + expected = self._reference_pad_bounding_boxes(bounding_boxes, padding=padding) + + torch.testing.assert_close(actual, expected) + + def _reference_pad_keypoints(self, keypoints, *, padding): + if isinstance(padding, int): + padding = [padding] + left, top, right, bottom = padding * (4 // len(padding)) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + ) + + height = keypoints.canvas_size[0] + top + bottom + width = keypoints.canvas_size[1] + left + right + + return reference_affine_keypoints_helper( + keypoints, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("padding", CORRECTNESS_PADDINGS) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.pad, transform_cls_to_functional(transforms.Pad)]) + def test_keypoints_correctness(self, padding, dtype, device, fn): + keypoints = make_keypoints(dtype=dtype, device=device) + + actual = fn(keypoints, padding=padding) + expected = self._reference_pad_keypoints(keypoints, padding=padding) + + assert_equal(actual, expected) + + +class TestCenterCrop: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = [(3, 5), (5, 3), (4, 4), (21, 9), (13, 15), (19, 14), 3, (4,), [5], INPUT_SIZE] + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, output_size, dtype, device): + check_kernel( + F.center_crop_image, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, output_size, format): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format) + check_kernel( + F.center_crop_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + def test_kernel_keypoints(self, output_size): + keypoints = make_keypoints(self.INPUT_SIZE) + check_kernel( + F.center_crop_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + output_size=output_size, + check_scripted_vs_eager=not isinstance(output_size, int), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.center_crop_mask, make_mask(), output_size=self.OUTPUT_SIZES[0]) + + def test_kernel_video(self): + check_kernel(F.center_crop_video, make_video(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) + + 
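+ # Illustrative note: the reference helpers below model center crop as a pure
+ # translation by (-left, -top), with top = round((H - h) / 2) and
+ # left = round((W - w) / 2). For example, with INPUT_SIZE (17, 11) and an output
+ # size of (3, 5): top = round((17 - 3) / 2) = 7 and left = round((11 - 5) / 2) = 3,
+ # so a point at (x, y) lands at (x - 3, y - 7) on the new (3, 5) canvas.
+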
@pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.center_crop, make_input(self.INPUT_SIZE), output_size=self.OUTPUT_SIZES[0]) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.center_crop_image, torch.Tensor), + (F._geometry._center_crop_image_pil, PIL.Image.Image), + (F.center_crop_image, tv_tensors.Image), + (F.center_crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.center_crop_mask, tv_tensors.Mask), + (F.center_crop_video, tv_tensors.Video), + (F.center_crop_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.center_crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.CenterCrop(self.OUTPUT_SIZES[0]), make_input(self.INPUT_SIZE)) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_image_correctness(self, output_size, fn): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, output_size) + expected = F.to_image(F.center_crop(F.to_pil_image(image), output_size=output_size)) + + assert_equal(actual, expected) + + def _reference_center_crop_bounding_boxes(self, bounding_boxes, output_size): + image_height, image_width = bounding_boxes.canvas_size + if isinstance(output_size, int): + output_size = (output_size, output_size) + elif len(output_size) == 1: + output_size *= 2 + crop_height, crop_width = output_size + + top = int(round((image_height - crop_height) / 2)) + left = int(round((image_width - crop_width) / 2)) + + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + helper = ( + reference_affine_rotated_bounding_boxes_helper + if tv_tensors.is_rotated_bounding_format(bounding_boxes.format) + else reference_affine_bounding_boxes_helper + ) + return helper(bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=output_size) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_bounding_boxes_correctness(self, output_size, format, dtype, device, fn): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = fn(bounding_boxes, output_size) + expected = self._reference_center_crop_bounding_boxes(bounding_boxes, output_size) + + torch.testing.assert_close(actual, expected) + + def _reference_center_crop_keypoints(self, keypoints, output_size): + image_height, image_width = keypoints.canvas_size + if isinstance(output_size, int): + output_size = (output_size, output_size) + elif len(output_size) == 1: + output_size *= 2 + crop_height, 
crop_width = output_size + + top = int(round((image_height - crop_height) / 2)) + left = int(round((image_width - crop_width) / 2)) + + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_keypoints_helper(keypoints, affine_matrix=affine_matrix, new_canvas_size=output_size) + + @pytest.mark.parametrize("output_size", OUTPUT_SIZES) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("fn", [F.center_crop, transform_cls_to_functional(transforms.CenterCrop)]) + def test_keypoints_correctness(self, output_size, dtype, device, fn): + keypoints = make_keypoints(self.INPUT_SIZE, dtype=dtype, device=device) + + actual = fn(keypoints, output_size) + expected = self._reference_center_crop_keypoints(keypoints, output_size) + + assert_equal(actual, expected) + + +class TestPerspective: + COEFFICIENTS = [ + [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], + [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], + ] + START_END_POINTS = [ + ([[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]), + ([[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]), + ([[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]), + ] + MINIMAL_KWARGS = dict(startpoints=None, endpoints=None, coefficients=COEFFICIENTS[0]) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + if param == "fill": + kwargs["coefficients"] = self.COEFFICIENTS[0] + + check_kernel( + F.perspective_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + def test_kernel_image_error(self): + image = make_image_tensor() + + with pytest.raises(ValueError, match="startpoints/endpoints or the coefficients must have non `None` values"): + F.perspective_image(image, startpoints=None, endpoints=None) + + with pytest.raises( + ValueError, match="startpoints/endpoints and the coefficients shouldn't be defined concurrently" + ): + startpoints, endpoints = self.START_END_POINTS[0] + coefficients = self.COEFFICIENTS[0] + F.perspective_image(image, startpoints=startpoints, endpoints=endpoints, coefficients=coefficients) + + with pytest.raises(ValueError, match="coefficients should have 8 float values"): + F.perspective_image(image, startpoints=None, endpoints=None, coefficients=list(range(7))) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + def test_kernel_bounding_boxes(self, param, value, format): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + + bounding_boxes = make_bounding_boxes(format=format) + + check_kernel( + F.perspective_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + def 
test_kernel_bounding_boxes_error(self): + bounding_boxes = make_bounding_boxes() + format, canvas_size = bounding_boxes.format, bounding_boxes.canvas_size + bounding_boxes = bounding_boxes.as_subclass(torch.Tensor) + + with pytest.raises(RuntimeError, match="Denominator is zero"): + F.perspective_bounding_boxes( + bounding_boxes, + format=format, + canvas_size=canvas_size, + startpoints=None, + endpoints=None, + coefficients=[0.0] * 8, + ) + + @param_value_parametrization( + coefficients=COEFFICIENTS, + start_end_points=START_END_POINTS, + ) + def test_kernel_keypoints(self, param, value): + if param == "start_end_points": + kwargs = dict(zip(["startpoints", "endpoints"], value)) + else: + kwargs = {"startpoints": None, "endpoints": None, param: value} + + keypoints = make_keypoints() + + check_kernel( + F.perspective_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + **kwargs, + ) + + def test_kernel_keypoints_error(self): + keypoints = make_keypoints() + canvas_size = keypoints.canvas_size + keypoints = keypoints.as_subclass(torch.Tensor) + + with pytest.raises(RuntimeError, match="Denominator is zero"): + F.perspective_keypoints( + keypoints, + canvas_size=canvas_size, + startpoints=None, + endpoints=None, + coefficients=[0.0] * 8, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_masks]) + def test_kernel_mask(self, make_mask): + check_kernel(F.perspective_mask, make_mask(), **self.MINIMAL_KWARGS) + + def test_kernel_video(self): + check_kernel(F.perspective_video, make_video(), **self.MINIMAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_functional(self, make_input): + check_functional(F.perspective, make_input(), **self.MINIMAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.perspective_image, torch.Tensor), + (F._geometry._perspective_image_pil, PIL.Image.Image), + (F.perspective_image, tv_tensors.Image), + (F.perspective_bounding_boxes, tv_tensors.BoundingBoxes), + (F.perspective_mask, tv_tensors.Mask), + (F.perspective_video, tv_tensors.Video), + (F.perspective_keypoints, tv_tensors.KeyPoints), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.perspective, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("distortion_scale", [0.5, 0.0, 1.0]) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_video, + make_keypoints, + ], + ) + def test_transform(self, distortion_scale, make_input): + check_transform(transforms.RandomPerspective(distortion_scale=distortion_scale, p=1), make_input()) + + @pytest.mark.parametrize("distortion_scale", [-1, 2]) + def test_transform_error(self, distortion_scale): + with pytest.raises(ValueError, match="distortion_scale value should be between 0 and 1"): + transforms.RandomPerspective(distortion_scale=distortion_scale) + + @pytest.mark.parametrize("coefficients", COEFFICIENTS) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_image_functional_correctness(self, coefficients, interpolation, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.perspective( + image, 
startpoints=None, endpoints=None, coefficients=coefficients, interpolation=interpolation, fill=fill + ) + expected = F.to_image( + F.perspective( + F.to_pil_image(image), + startpoints=None, + endpoints=None, + coefficients=coefficients, + interpolation=interpolation, + fill=fill, + ) + ) + + if interpolation is transforms.InterpolationMode.BILINEAR: + abs_diff = (actual.float() - expected.float()).abs() + assert (abs_diff > 1).float().mean() < 7e-2 + mae = abs_diff.mean() + assert mae < 3 + else: + assert_equal(actual, expected) + + def _reference_perspective_bounding_boxes(self, bounding_boxes, *, startpoints, endpoints): + format = bounding_boxes.format + canvas_size = bounding_boxes.canvas_size + clamping_mode = bounding_boxes.clamping_mode + dtype = bounding_boxes.dtype + device = bounding_boxes.device + is_rotated = tv_tensors.is_rotated_bounding_format(format) + ndims = 4 + if is_rotated and format == tv_tensors.BoundingBoxFormat.XYXYXYXY: + ndims = 8 + if is_rotated and format != tv_tensors.BoundingBoxFormat.XYXYXYXY: + ndims = 5 + + coefficients = _get_perspective_coeffs(endpoints, startpoints) + + def perspective_bounding_boxes(bounding_boxes): + m1 = np.array( + [ + [coefficients[0], coefficients[1], coefficients[2]], + [coefficients[3], coefficients[4], coefficients[5]], + ] + ) + m2 = np.array( + [ + [coefficients[6], coefficients[7], 1.0], + [coefficients[6], coefficients[7], 1.0], + ] + ) + + if is_rotated: + input_xyxyxyxy = F.convert_bounding_box_format( + bounding_boxes.to(device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXYXYXY, + inplace=True, + ) + x1, y1, x2, y2, x3, y3, x4, y4 = input_xyxyxyxy.squeeze(0).tolist() + points = np.array( + [ + [x1, y1, 1.0], + [x2, y2, 1.0], + [x3, y3, 1.0], + [x4, y4, 1.0], + ] + ) + + else: + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + + numerator = points @ m1.astype(points.dtype).T + denominator = points @ m2.astype(points.dtype).T + transformed_points = numerator / denominator + + if is_rotated: + output = torch.Tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + float(transformed_points[1, 0]), + float(transformed_points[1, 1]), + float(transformed_points[2, 0]), + float(transformed_points[2, 1]), + float(transformed_points[3, 0]), + float(transformed_points[3, 1]), + ] + ) + output = _parallelogram_to_bounding_boxes(output) + else: + output = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output, + old_format=tv_tensors.BoundingBoxFormat.XYXYXYXY if is_rotated else tv_tensors.BoundingBoxFormat.XYXY, + new_format=format, + ) + + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + return F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + clamping_mode=clamping_mode, + ).to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat( + [perspective_bounding_boxes(b) for b in 
bounding_boxes.reshape(-1, ndims).unbind()], dim=0 + ).reshape(bounding_boxes.shape), + format=format, + canvas_size=canvas_size, + ) + + @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_correctness_perspective_bounding_boxes(self, startpoints, endpoints, format, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + actual = F.perspective(bounding_boxes, startpoints=startpoints, endpoints=endpoints) + expected = self._reference_perspective_bounding_boxes( + bounding_boxes, startpoints=startpoints, endpoints=endpoints + ) + + assert_close(actual, expected, rtol=0, atol=1) + + def _reference_perspective_keypoints(self, keypoints, *, startpoints, endpoints): + canvas_size = keypoints.canvas_size + dtype = keypoints.dtype + device = keypoints.device + + coefficients = _get_perspective_coeffs(endpoints, startpoints) + + def perspective_keypoints(keypoints): + m1 = np.array( + [ + [coefficients[0], coefficients[1], coefficients[2]], + [coefficients[3], coefficients[4], coefficients[5]], + ] + ) + m2 = np.array( + [ + [coefficients[6], coefficients[7], 1.0], + [coefficients[6], coefficients[7], 1.0], + ] + ) + + # Go to float before converting to prevent precision loss + x, y = keypoints.to(dtype=torch.float64, device="cpu", copy=True).squeeze(0).tolist() + + points = np.array([[x, y, 1.0]]) + + numerator = points @ m1.T + denominator = points @ m2.T + transformed_points = numerator / denominator + + output = torch.Tensor( + [ + float(transformed_points[0, 0]), + float(transformed_points[0, 1]), + ] + ) + + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + return F.clamp_keypoints( + output, + canvas_size=canvas_size, + ).to(dtype=dtype, device=device) + + return tv_tensors.KeyPoints( + torch.cat([perspective_keypoints(k) for k in keypoints.reshape(-1, 2).unbind()], dim=0).reshape( + keypoints.shape + ), + canvas_size=canvas_size, + ) + + @pytest.mark.parametrize(("startpoints", "endpoints"), START_END_POINTS) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_correctness_perspective_keypoints(self, startpoints, endpoints, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + + actual = F.perspective(keypoints, startpoints=startpoints, endpoints=endpoints) + expected = self._reference_perspective_keypoints(keypoints, startpoints=startpoints, endpoints=endpoints) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestEqualize: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.equalize_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.equalize_image, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.equalize, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.equalize_image, torch.Tensor), + 
(F._color._equalize_image_pil, PIL.Image.Image), + (F.equalize_image, tv_tensors.Image), + (F.equalize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.equalize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_transform(self, make_input): + check_transform(transforms.RandomEqualize(p=1), make_input()) + + @pytest.mark.parametrize(("low", "high"), [(0, 64), (64, 192), (192, 256), (0, 1), (127, 128), (255, 256)]) + @pytest.mark.parametrize("fn", [F.equalize, transform_cls_to_functional(transforms.RandomEqualize, p=1)]) + def test_image_correctness(self, low, high, fn): + # We are not using the default `make_image` here since that uniformly samples the values over the whole value + # range. Since the whole point of F.equalize is to transform an arbitrary distribution of values into a uniform + # one over the full range, the information gain is low if we already provide something really close to the + # expected value. + image = tv_tensors.Image( + torch.testing.make_tensor((3, 117, 253), dtype=torch.uint8, device="cpu", low=low, high=high) + ) + + actual = fn(image) + expected = F.to_image(F.equalize(F.to_pil_image(image))) + + assert_equal(actual, expected) + + +class TestUniformTemporalSubsample: + def test_kernel_video(self): + check_kernel(F.uniform_temporal_subsample_video, make_video(), num_samples=2) + + @pytest.mark.parametrize("make_input", [make_video_tensor, make_video]) + def test_functional(self, make_input): + check_functional(F.uniform_temporal_subsample, make_input(), num_samples=2) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.uniform_temporal_subsample_video, torch.Tensor), + (F.uniform_temporal_subsample_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.uniform_temporal_subsample, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_video_tensor, make_video]) + def test_transform(self, make_input): + check_transform(transforms.UniformTemporalSubsample(num_samples=2), make_input()) + + def _reference_uniform_temporal_subsample_video(self, video, *, num_samples): + # Adapted from + # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19 + t = video.shape[-4] + assert num_samples > 0 and t > 0 + # Sample by nearest neighbor interpolation if num_samples > t. 
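+ # Worked example (illustrative, not an extra check): with t=5 frames,
+ # num_samples=3 gives linspace(0, 4, 3) = [0., 2., 4.], i.e. frames 0, 2 and 4
+ # are kept; with num_samples=7 > t, linspace(0, 4, 7).long() gives
+ # [0, 0, 1, 2, 2, 3, 4], i.e. frames are repeated (nearest-neighbor upsampling
+ # along the time dimension).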
+ indices = torch.linspace(0, t - 1, num_samples, device=video.device) + indices = torch.clamp(indices, 0, t - 1).long() + return tv_tensors.Video(torch.index_select(video, -4, indices)) + + CORRECTNESS_NUM_FRAMES = 5 + + @pytest.mark.parametrize("num_samples", list(range(1, CORRECTNESS_NUM_FRAMES + 1))) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize( + "fn", [F.uniform_temporal_subsample, transform_cls_to_functional(transforms.UniformTemporalSubsample)] + ) + def test_video_correctness(self, num_samples, dtype, device, fn): + video = make_video(num_frames=self.CORRECTNESS_NUM_FRAMES, dtype=dtype, device=device) + + actual = fn(video, num_samples=num_samples) + expected = self._reference_uniform_temporal_subsample_video(video, num_samples=num_samples) + + assert_equal(actual, expected) + + +class TestNormalize: + MEANS_STDS = [ + ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), + ] + MEAN, STD = MEANS_STDS[0] + + @pytest.mark.parametrize(("mean", "std"), [*MEANS_STDS, (0.5, 2.0)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, mean, std, device): + check_kernel(F.normalize_image, make_image(dtype=torch.float32, device=device), mean=self.MEAN, std=self.STD) + + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, device): + input = make_image_tensor(dtype=torch.float32, device=device) + input_version = input._version + + output_out_of_place = F.normalize_image(input, mean=self.MEAN, std=self.STD) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.normalize_image(input, mean=self.MEAN, std=self.STD, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.normalize_video, make_video(dtype=torch.float32), mean=self.MEAN, std=self.STD) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.normalize, make_input(dtype=torch.float32), mean=self.MEAN, std=self.STD) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.normalize_image, torch.Tensor), + (F.normalize_image, tv_tensors.Image), + (F.normalize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.normalize, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="should be a float tensor"): + F.normalize_image(make_image(dtype=torch.uint8), mean=self.MEAN, std=self.STD) + + with pytest.raises(ValueError, match="tensor image of size"): + F.normalize_image(torch.rand(16, 16, dtype=torch.float32), mean=self.MEAN, std=self.STD) + + for std in [0, [0, 0, 0], [0, 1, 1]]: + with pytest.raises(ValueError, match="std evaluated to zero, leading to division by zero"): + F.normalize_image(make_image(dtype=torch.float32), mean=self.MEAN, std=std) + + def _sample_input_adapter(self, transform, input, device): + adapted_input = {} + for key, value in input.items(): + if isinstance(value, PIL.Image.Image): + # normalize doesn't support PIL images + continue + elif check_type(value, (is_pure_tensor, tv_tensors.Image, tv_tensors.Video)): 
+ # normalize doesn't support integer images + value = F.to_dtype(value, torch.float32, scale=True) + adapted_input[key] = value + return adapted_input + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + def test_transform(self, make_input): + check_transform( + transforms.Normalize(mean=self.MEAN, std=self.STD), + make_input(dtype=torch.float32), + check_sample_input=self._sample_input_adapter, + ) + + def _reference_normalize_image(self, image, *, mean, std): + image = image.numpy() + mean, std = (np.array(stat, dtype=image.dtype).reshape((-1, 1, 1)) for stat in [mean, std]) + return tv_tensors.Image((image - mean) / std) + + @pytest.mark.parametrize(("mean", "std"), MEANS_STDS) + @pytest.mark.parametrize("dtype", [torch.float16, torch.float32, torch.float64]) + @pytest.mark.parametrize("fn", [F.normalize, transform_cls_to_functional(transforms.Normalize)]) + def test_correctness_image(self, mean, std, dtype, fn): + image = make_image(dtype=dtype) + + actual = fn(image, mean=mean, std=std) + expected = self._reference_normalize_image(image, mean=mean, std=std) + + assert_equal(actual, expected) + + +class TestClampBoundingBoxes: + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("clamping_mode", ("soft", "hard", None)) + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, format, clamping_mode, dtype, device): + if not dtype.is_floating_point and tv_tensors.is_rotated_bounding_format(format): + pytest.xfail("Rotated bounding boxes should be floating point tensors") + bounding_boxes = make_bounding_boxes(format=format, clamping_mode=clamping_mode, dtype=dtype, device=device) + check_kernel( + F.clamp_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + clamping_mode=clamping_mode, + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("clamping_mode", ("soft", "hard", None)) + def test_functional(self, format, clamping_mode): + check_functional(F.clamp_bounding_boxes, make_bounding_boxes(format=format, clamping_mode=clamping_mode)) + + def test_errors(self): + input_tv_tensor = make_bounding_boxes() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + format, canvas_size = input_tv_tensor.format, input_tv_tensor.canvas_size + + for format_, canvas_size_, clamping_mode_ in itertools.product( + (format, None), (canvas_size, None), (input_tv_tensor.clamping_mode, "auto") + ): + with pytest.raises( + ValueError, + match="For pure tensor inputs, `format`, `canvas_size` and `clamping_mode` have to be passed.", + ): + F.clamp_bounding_boxes(input_pure_tensor, format=format_, canvas_size=canvas_size_) + + for format_, canvas_size_ in [(format, canvas_size), (format, None), (None, canvas_size)]: + with pytest.raises( + ValueError, match="For bounding box tv_tensor inputs, `format` and `canvas_size` must not be passed." 
+ ): + F.clamp_bounding_boxes(input_tv_tensor, format=format_, canvas_size=canvas_size_) + + with pytest.raises(ValueError, match="clamping_mode must be soft,"): + F.clamp_bounding_boxes(input_tv_tensor, clamping_mode="bad") + with pytest.raises(ValueError, match="clamping_mode must be soft,"): + transforms.ClampBoundingBoxes(clamping_mode="bad")(input_tv_tensor) + + def test_transform(self): + check_transform(transforms.ClampBoundingBoxes(), make_bounding_boxes()) + + @pytest.mark.parametrize("rotated", (True, False)) + @pytest.mark.parametrize("constructor_clamping_mode", ("soft", "hard", None)) + @pytest.mark.parametrize("clamping_mode", ("soft", "hard", None, "auto")) + @pytest.mark.parametrize("pass_pure_tensor", (True, False)) + @pytest.mark.parametrize("fn", [F.clamp_bounding_boxes, transform_cls_to_functional(transforms.ClampBoundingBoxes)]) + def test_clamping_mode(self, rotated, constructor_clamping_mode, clamping_mode, pass_pure_tensor, fn): + # This test checks 2 things: + # - That passing clamping_mode=None to the clamp_bounding_boxes + # functional (or to the class) relies on the box's `.clamping_mode` + # attribute + # - That clamping happens when it should, and only when it should, i.e. + # when the clamping mode is not None. It doesn't validate the + # numerical results, only that clamping happened. For that, we create + # a large 100x100 box inside of a small 10x10 image. + + if pass_pure_tensor and fn is not F.clamp_bounding_boxes: + # Only the functional supports pure tensors, not the class + return + if pass_pure_tensor and clamping_mode == "auto": + # cannot leave clamping_mode="auto" when passing pure tensor + return + + if rotated: + boxes = tv_tensors.BoundingBoxes( + [0.0, 0.0, 100.0, 100.0, 0.0], + format="XYWHR", + canvas_size=(10, 10), + clamping_mode=constructor_clamping_mode, + ) + expected_clamped_output = torch.tensor([[0.0, 0.0, 10.0, 10.0, 0.0]]) + else: + boxes = tv_tensors.BoundingBoxes( + [0, 100, 0, 100], format="XYXY", canvas_size=(10, 10), clamping_mode=constructor_clamping_mode + ) + expected_clamped_output = torch.tensor([[0, 10, 0, 10]]) + + if pass_pure_tensor: + out = fn( + boxes.as_subclass(torch.Tensor), + format=boxes.format, + canvas_size=boxes.canvas_size, + clamping_mode=clamping_mode, + ) + else: + out = fn(boxes, clamping_mode=clamping_mode) + + clamping_mode_prevailing = constructor_clamping_mode if clamping_mode == "auto" else clamping_mode + if clamping_mode_prevailing is None: + assert_equal(boxes, out) # should be a pass-through + else: + assert_equal(out, expected_clamped_output) + + +class TestSetClampingMode: + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("constructor_clamping_mode", ("soft", "hard", None)) + @pytest.mark.parametrize("desired_clamping_mode", ("soft", "hard", None)) + def test_setter(self, format, constructor_clamping_mode, desired_clamping_mode): + + in_boxes = make_bounding_boxes(format=format, clamping_mode=constructor_clamping_mode) + out_boxes = transforms.SetClampingMode(clamping_mode=desired_clamping_mode)(in_boxes) + + assert in_boxes.clamping_mode == constructor_clamping_mode # input is unchanged: no leak + assert out_boxes.clamping_mode == desired_clamping_mode + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("constructor_clamping_mode", ("soft", "hard", None)) + def test_pipeline_no_leak(self, format, constructor_clamping_mode): + class AssertClampingMode(transforms.Transform): + def __init__(self, 
expected_clamping_mode): + super().__init__() + self.expected_clamping_mode = expected_clamping_mode + + _transformed_types = (tv_tensors.BoundingBoxes,) + + def transform(self, inpt, _): + assert inpt.clamping_mode == self.expected_clamping_mode + return inpt + + t = transforms.Compose( + [ + transforms.SetClampingMode(None), + AssertClampingMode(None), + transforms.SetClampingMode("hard"), + AssertClampingMode("hard"), + transforms.SetClampingMode(None), + AssertClampingMode(None), + transforms.ClampBoundingBoxes("hard"), + ] + ) + + in_boxes = make_bounding_boxes(format=format, clamping_mode=constructor_clamping_mode) + out_boxes = t(in_boxes) + + assert in_boxes.clamping_mode == constructor_clamping_mode # input is unchanged: no leak + + # assert that the output boxes clamping_mode is the one set by the last SetClampingMode. + # ClampBoundingBoxes doesn't set clamping_mode. + assert out_boxes.clamping_mode is None + + def test_error(self): + with pytest.raises(ValueError, match="clamping_mode must be"): + transforms.SetClampingMode("bad") + + +class TestClampKeyPoints: + @pytest.mark.parametrize("dtype", [torch.int64, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, dtype, device): + keypoints = make_keypoints(dtype=dtype, device=device) + check_kernel( + F.clamp_keypoints, + keypoints, + canvas_size=keypoints.canvas_size, + ) + + def test_functional(self): + check_functional(F.clamp_keypoints, make_keypoints()) + + def test_errors(self): + input_tv_tensor = make_keypoints() + input_pure_tensor = input_tv_tensor.as_subclass(torch.Tensor) + + with pytest.raises(ValueError, match="`canvas_size` has to be passed"): + F.clamp_keypoints(input_pure_tensor, canvas_size=None) + + with pytest.raises(ValueError, match="`canvas_size` must not be passed"): + F.clamp_keypoints(input_tv_tensor, canvas_size=input_tv_tensor.canvas_size) + + def test_transform(self): + check_transform(transforms.ClampKeyPoints(), make_keypoints()) + + +class TestInvert: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.int16, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.invert_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.invert_video, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.invert, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.invert_image, torch.Tensor), + (F._color._invert_image_pil, PIL.Image.Image), + (F.invert_image, tv_tensors.Image), + (F.invert_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.invert, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomInvert(p=1), make_input()) + + @pytest.mark.parametrize("fn", [F.invert, transform_cls_to_functional(transforms.RandomInvert, p=1)]) + def test_correctness_image(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.invert(F.to_pil_image(image))) + + assert_equal(actual, expected) + + +class TestPosterize: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + 
@pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.posterize_image, make_image(dtype=dtype, device=device), bits=1) + + def test_kernel_video(self): + check_kernel(F.posterize_video, make_video(), bits=1) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.posterize, make_input(), bits=1) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.posterize_image, torch.Tensor), + (F._color._posterize_image_pil, PIL.Image.Image), + (F.posterize_image, tv_tensors.Image), + (F.posterize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.posterize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomPosterize(bits=1, p=1), make_input()) + + @pytest.mark.parametrize("bits", [1, 4, 8]) + @pytest.mark.parametrize("fn", [F.posterize, transform_cls_to_functional(transforms.RandomPosterize, p=1)]) + def test_correctness_image(self, bits, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, bits=bits) + expected = F.to_image(F.posterize(F.to_pil_image(image), bits=bits)) + + assert_equal(actual, expected) + + @pytest.mark.parametrize("bits", [-1, 9, 2.1]) + def test_error_functional(self, bits): + with pytest.raises( + TypeError, + match=re.escape(f"bits must be a positive integer in the range [0, 8], got {bits} instead."), + ): + F.posterize(make_image(dtype=torch.uint8), bits=bits) + + +class TestSolarize: + def _make_threshold(self, input, *, factor=0.5): + dtype = input.dtype if isinstance(input, torch.Tensor) else torch.uint8 + return (float if dtype.is_floating_point else int)(get_max_value(dtype) * factor) + + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + image = make_image(dtype=dtype, device=device) + check_kernel(F.solarize_image, image, threshold=self._make_threshold(image)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.solarize_video, video, threshold=self._make_threshold(video)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + input = make_input() + check_functional(F.solarize, input, threshold=self._make_threshold(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.solarize_image, torch.Tensor), + (F._color._solarize_image_pil, PIL.Image.Image), + (F.solarize_image, tv_tensors.Image), + (F.solarize_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.solarize, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize(("dtype", "threshold"), [(torch.uint8, 256), (torch.float, 1.5)]) + def test_functional_error(self, dtype, threshold): + with pytest.raises(TypeError, match="Threshold should be less or equal the maximum value of the dtype"): + F.solarize(make_image(dtype=dtype), threshold=threshold) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + input = make_input() + 
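+ # _make_threshold above scales the maximum value of the input's dtype, e.g. with
+ # the default factor of 0.5 the threshold is int(255 * 0.5) == 127 for uint8
+ # inputs and 0.5 for float32 inputs.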
check_transform(transforms.RandomSolarize(threshold=self._make_threshold(input), p=1), input) + + @pytest.mark.parametrize("threshold_factor", [0.0, 0.1, 0.5, 0.9, 1.0]) + @pytest.mark.parametrize("fn", [F.solarize, transform_cls_to_functional(transforms.RandomSolarize, p=1)]) + def test_correctness_image(self, threshold_factor, fn): + image = make_image(dtype=torch.uint8, device="cpu") + threshold = self._make_threshold(image, factor=threshold_factor) + + actual = fn(image, threshold=threshold) + expected = F.to_image(F.solarize(F.to_pil_image(image), threshold=threshold)) + + assert_equal(actual, expected) + + +class TestAutocontrast: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.int16, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.autocontrast_image, make_image(dtype=dtype, device=device)) + + def test_kernel_video(self): + check_kernel(F.autocontrast_video, make_video()) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.autocontrast, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.autocontrast_image, torch.Tensor), + (F._color._autocontrast_image_pil, PIL.Image.Image), + (F.autocontrast_image, tv_tensors.Image), + (F.autocontrast_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.autocontrast, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomAutocontrast(p=1), make_input(), check_v1_compatibility=dict(rtol=0, atol=1)) + + @pytest.mark.parametrize("fn", [F.autocontrast, transform_cls_to_functional(transforms.RandomAutocontrast, p=1)]) + def test_correctness_image(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.autocontrast(F.to_pil_image(image))) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestAdjustSharpness: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_sharpness_image, make_image(dtype=dtype, device=device), sharpness_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_sharpness_video, make_video(), sharpness_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_sharpness, make_input(), sharpness_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_sharpness_image, torch.Tensor), + (F._color._adjust_sharpness_image_pil, PIL.Image.Image), + (F.adjust_sharpness_image, tv_tensors.Image), + (F.adjust_sharpness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_sharpness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_transform(self, make_input): + check_transform(transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1), make_input()) + + def test_functional_error(self): + with 
pytest.raises(TypeError, match="can have 1 or 3 channels"): + F.adjust_sharpness(make_image(color_space="RGBA"), sharpness_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_sharpness(make_image(), sharpness_factor=-1) + + @pytest.mark.parametrize("sharpness_factor", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize( + "fn", [F.adjust_sharpness, transform_cls_to_functional(transforms.RandomAdjustSharpness, p=1)] + ) + def test_correctness_image(self, sharpness_factor, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image, sharpness_factor=sharpness_factor) + expected = F.to_image(F.adjust_sharpness(F.to_pil_image(image), sharpness_factor=sharpness_factor)) + + assert_equal(actual, expected) + + +class TestAdjustContrast: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_contrast_image, make_image(dtype=dtype, device=device), contrast_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_contrast_video, make_video(), contrast_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_contrast, make_input(), contrast_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_contrast_image, torch.Tensor), + (F._color._adjust_contrast_image_pil, PIL.Image.Image), + (F.adjust_contrast_image, tv_tensors.Image), + (F.adjust_contrast_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_contrast, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_contrast(make_image(color_space="RGBA"), contrast_factor=0.5) + + with pytest.raises(ValueError, match="is not non-negative"): + F.adjust_contrast(make_image(), contrast_factor=-1) + + @pytest.mark.parametrize("contrast_factor", [0.1, 0.5, 1.0]) + def test_correctness_image(self, contrast_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_contrast(image, contrast_factor=contrast_factor) + expected = F.to_image(F.adjust_contrast(F.to_pil_image(image), contrast_factor=contrast_factor)) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestAdjustGamma: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_gamma_image, make_image(dtype=dtype, device=device), gamma=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_gamma_video, make_video(), gamma=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_gamma, make_input(), gamma=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_gamma_image, torch.Tensor), + (F._color._adjust_gamma_image_pil, PIL.Image.Image), + (F.adjust_gamma_image, tv_tensors.Image), + (F.adjust_gamma_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_gamma, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with 
pytest.raises(ValueError, match="Gamma should be a non-negative real number"): + F.adjust_gamma(make_image(), gamma=-1) + + @pytest.mark.parametrize("gamma", [0.1, 0.5, 1.0]) + @pytest.mark.parametrize("gain", [0.1, 1.0, 2.0]) + def test_correctness_image(self, gamma, gain): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_gamma(image, gamma=gamma, gain=gain) + expected = F.to_image(F.adjust_gamma(F.to_pil_image(image), gamma=gamma, gain=gain)) + + assert_equal(actual, expected) + + +class TestAdjustHue: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_hue_image, make_image(dtype=dtype, device=device), hue_factor=0.25) + + def test_kernel_video(self): + check_kernel(F.adjust_hue_video, make_video(), hue_factor=0.25) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_hue, make_input(), hue_factor=0.25) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_hue_image, torch.Tensor), + (F._color._adjust_hue_image_pil, PIL.Image.Image), + (F.adjust_hue_image, tv_tensors.Image), + (F.adjust_hue_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_hue, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_hue(make_image(color_space="RGBA"), hue_factor=0.25) + + for hue_factor in [-1, 1]: + with pytest.raises(ValueError, match=re.escape("is not in [-0.5, 0.5]")): + F.adjust_hue(make_image(), hue_factor=hue_factor) + + @pytest.mark.parametrize("hue_factor", [-0.5, -0.3, 0.0, 0.2, 0.5]) + def test_correctness_image(self, hue_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_hue(image, hue_factor=hue_factor) + expected = F.to_image(F.adjust_hue(F.to_pil_image(image), hue_factor=hue_factor)) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 + + +class TestAdjustSaturation: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.adjust_saturation_image, make_image(dtype=dtype, device=device), saturation_factor=0.5) + + def test_kernel_video(self): + check_kernel(F.adjust_saturation_video, make_video(), saturation_factor=0.5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_pil, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_saturation, make_input(), saturation_factor=0.5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_saturation_image, torch.Tensor), + (F._color._adjust_saturation_image_pil, PIL.Image.Image), + (F.adjust_saturation_image, tv_tensors.Image), + (F.adjust_saturation_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_saturation, kernel=kernel, input_type=input_type) + + def test_functional_error(self): + with pytest.raises(TypeError, match="permitted channel values are 1 or 3"): + F.adjust_saturation(make_image(color_space="RGBA"), saturation_factor=0.5) + + with pytest.raises(ValueError, match="is not 
non-negative"): + F.adjust_saturation(make_image(), saturation_factor=-1) + + @pytest.mark.parametrize("saturation_factor", [0.1, 0.5, 1.0]) + def test_correctness_image(self, saturation_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_saturation(image, saturation_factor=saturation_factor) + expected = F.to_image(F.adjust_saturation(F.to_pil_image(image), saturation_factor=saturation_factor)) + + assert_close(actual, expected, rtol=0, atol=1) + + +class TestFiveTenCrop: + INPUT_SIZE = (17, 11) + OUTPUT_SIZE = (3, 5) + + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("kernel", [F.five_crop_image, F.ten_crop_image]) + def test_kernel_image(self, dtype, device, kernel): + check_kernel( + kernel, + make_image(self.INPUT_SIZE, dtype=dtype, device=device), + size=self.OUTPUT_SIZE, + check_batched_vs_unbatched=False, + ) + + @pytest.mark.parametrize("kernel", [F.five_crop_video, F.ten_crop_video]) + def test_kernel_video(self, kernel): + check_kernel(kernel, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZE, check_batched_vs_unbatched=False) + + def _functional_wrapper(self, fn): + # This wrapper is needed to make five_crop / ten_crop compatible with check_functional, since that requires a + # single output rather than a sequence. + @functools.wraps(fn) + def wrapper(*args, **kwargs): + outputs = fn(*args, **kwargs) + return outputs[0] + + return wrapper + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("functional", [F.five_crop, F.ten_crop]) + def test_functional(self, make_input, functional): + check_functional( + self._functional_wrapper(functional), + make_input(self.INPUT_SIZE), + size=self.OUTPUT_SIZE, + check_scripted_smoke=False, + ) + + @pytest.mark.parametrize( + ("functional", "kernel", "input_type"), + [ + (F.five_crop, F.five_crop_image, torch.Tensor), + (F.five_crop, F._geometry._five_crop_image_pil, PIL.Image.Image), + (F.five_crop, F.five_crop_image, tv_tensors.Image), + (F.five_crop, F.five_crop_video, tv_tensors.Video), + (F.ten_crop, F.ten_crop_image, torch.Tensor), + (F.ten_crop, F._geometry._ten_crop_image_pil, PIL.Image.Image), + (F.ten_crop, F.ten_crop_image, tv_tensors.Image), + (F.ten_crop, F.ten_crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, functional, kernel, input_type): + check_functional_kernel_signature_match(functional, kernel=kernel, input_type=input_type) + + class _TransformWrapper(nn.Module): + # This wrapper is needed to make FiveCrop / TenCrop compatible with check_transform, since that requires a + # single output rather than a sequence. 
+ _v1_transform_cls = None + + def _extract_params_for_v1_transform(self): + return dict(five_ten_crop_transform=self.five_ten_crop_transform) + + def __init__(self, five_ten_crop_transform): + super().__init__() + type(self)._v1_transform_cls = type(self) + self.five_ten_crop_transform = five_ten_crop_transform + + def forward(self, input: torch.Tensor) -> torch.Tensor: + outputs = self.five_ten_crop_transform(input) + return outputs[0] + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop]) + def test_transform(self, make_input, transform_cls): + check_transform( + self._TransformWrapper(transform_cls(size=self.OUTPUT_SIZE)), + make_input(self.INPUT_SIZE), + check_sample_input=False, + ) + + @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_masks]) + @pytest.mark.parametrize("transform_cls", [transforms.FiveCrop, transforms.TenCrop]) + def test_transform_error(self, make_input, transform_cls): + transform = transform_cls(size=self.OUTPUT_SIZE) + + with pytest.raises(TypeError, match="not supported"): + transform(make_input(self.INPUT_SIZE)) + + @pytest.mark.parametrize("fn", [F.five_crop, transform_cls_to_functional(transforms.FiveCrop)]) + def test_correctness_image_five_crop(self, fn): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, size=self.OUTPUT_SIZE) + expected = F.five_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE) + + assert isinstance(actual, tuple) + assert_equal(actual, [F.to_image(e) for e in expected]) + + @pytest.mark.parametrize("fn_or_class", [F.ten_crop, transforms.TenCrop]) + @pytest.mark.parametrize("vertical_flip", [False, True]) + def test_correctness_image_ten_crop(self, fn_or_class, vertical_flip): + if fn_or_class is transforms.TenCrop: + fn = transform_cls_to_functional(fn_or_class, size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + kwargs = dict() + else: + fn = fn_or_class + kwargs = dict(size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = fn(image, **kwargs) + expected = F.ten_crop(F.to_pil_image(image), size=self.OUTPUT_SIZE, vertical_flip=vertical_flip) + + assert isinstance(actual, tuple) + assert_equal(actual, [F.to_image(e) for e in expected]) + + +class TestColorJitter: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + + # TODO needed to add seed after KeyPoints PR, not sure why? failure + # wasn't really significant anyway. 
+ torch.manual_seed(1) + check_transform( + transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.25), + make_input(dtype=dtype, device=device), + ) + + def test_transform_noop(self): + input = make_image() + input_version = input._version + + transform = transforms.ColorJitter() + output = transform(input) + + assert output is input + assert output.data_ptr() == input.data_ptr() + assert output._version == input_version + + def test_transform_error(self): + with pytest.raises(ValueError, match="must be non negative"): + transforms.ColorJitter(brightness=-1) + + for brightness in [object(), [1, 2, 3]]: + with pytest.raises(TypeError, match="single number or a sequence with length 2"): + transforms.ColorJitter(brightness=brightness) + + with pytest.raises(ValueError, match="values should be between"): + transforms.ColorJitter(brightness=(-1, 0.5)) + + with pytest.raises(ValueError, match="values should be between"): + transforms.ColorJitter(hue=1) + + @pytest.mark.parametrize("brightness", [None, 0.1, (0.2, 0.3)]) + @pytest.mark.parametrize("contrast", [None, 0.4, (0.5, 0.6)]) + @pytest.mark.parametrize("saturation", [None, 0.7, (0.8, 0.9)]) + @pytest.mark.parametrize("hue", [None, 0.3, (-0.1, 0.2)]) + def test_transform_correctness(self, brightness, contrast, saturation, hue): + image = make_image(dtype=torch.uint8, device="cpu") + + transform = transforms.ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + + with freeze_rng_state(): + torch.manual_seed(0) + actual = transform(image) + + torch.manual_seed(0) + expected = F.to_image(transform(F.to_pil_image(image))) + + mae = (actual.float() - expected.float()).abs().mean() + assert mae < 2 + + +class TestRgbToGrayscale: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.rgb_to_grayscale_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_functional(self, make_input): + check_functional(F.rgb_to_grayscale, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rgb_to_grayscale_image, torch.Tensor), + (F._color._rgb_to_grayscale_image_pil, PIL.Image.Image), + (F.rgb_to_grayscale_image, tv_tensors.Image), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rgb_to_grayscale, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("transform", [transforms.Grayscale(), transforms.RandomGrayscale(p=1)]) + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_transform(self, transform, make_input): + check_transform(transform, make_input()) + + @pytest.mark.parametrize("num_output_channels", [1, 3]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.rgb_to_grayscale, transform_cls_to_functional(transforms.Grayscale)]) + def test_image_correctness(self, num_output_channels, color_space, fn): + image = make_image(dtype=torch.uint8, device="cpu", color_space=color_space) + + actual = fn(image, num_output_channels=num_output_channels) + expected = F.to_image(F.rgb_to_grayscale(F.to_pil_image(image), num_output_channels=num_output_channels)) + + assert_equal(actual, expected, rtol=0, atol=1) + + def test_expanded_channels_are_not_views_into_the_same_underlying_tensor(self): + image = 
make_image(dtype=torch.uint8, device="cpu", color_space="GRAY") + + output_image = F.rgb_to_grayscale(image, num_output_channels=3) + assert_equal(output_image[0][0][0], output_image[1][0][0]) + output_image[0][0][0] = output_image[0][0][0] + 1 + assert output_image[0][0][0] != output_image[1][0][0] + + @pytest.mark.parametrize("num_input_channels", [1, 3]) + def test_random_transform_correctness(self, num_input_channels): + image = make_image( + color_space={ + 1: "GRAY", + 3: "RGB", + }[num_input_channels], + dtype=torch.uint8, + device="cpu", + ) + + transform = transforms.RandomGrayscale(p=1) + + actual = transform(image) + expected = F.to_image(F.rgb_to_grayscale(F.to_pil_image(image), num_output_channels=num_input_channels)) + + assert_equal(actual, expected, rtol=0, atol=1) + + +class TestGrayscaleToRgb: + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.grayscale_to_rgb_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_functional(self, make_input): + check_functional(F.grayscale_to_rgb, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rgb_to_grayscale_image, torch.Tensor), + (F._color._rgb_to_grayscale_image_pil, PIL.Image.Image), + (F.rgb_to_grayscale_image, tv_tensors.Image), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.grayscale_to_rgb, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image]) + def test_transform(self, make_input): + check_transform(transforms.RGB(), make_input(color_space="GRAY")) + + @pytest.mark.parametrize("fn", [F.grayscale_to_rgb, transform_cls_to_functional(transforms.RGB)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu", color_space="GRAY") + + actual = fn(image) + expected = F.to_image(F.grayscale_to_rgb(F.to_pil_image(image))) + + assert_equal(actual, expected, rtol=0, atol=1) + + def test_expanded_channels_are_not_views_into_the_same_underlying_tensor(self): + image = make_image(dtype=torch.uint8, device="cpu", color_space="GRAY") + + output_image = F.grayscale_to_rgb(image) + assert_equal(output_image[0][0][0], output_image[1][0][0]) + output_image[0][0][0] = output_image[0][0][0] + 1 + assert output_image[0][0][0] != output_image[1][0][0] + + def test_rgb_image_is_unchanged(self): + image = make_image(dtype=torch.uint8, device="cpu", color_space="RGB") + assert_equal(image.shape[-3], 3) + assert_equal(F.grayscale_to_rgb(image), image) + + +class TestRandomZoomOut: + # Tests are light because this largely relies on the already tested `pad` kernels. 
+ + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + def test_transform(self, make_input): + check_transform(transforms.RandomZoomOut(p=1), make_input()) + + def test_transform_error(self): + for side_range in [None, 1, [1, 2, 3]]: + with pytest.raises( + ValueError if isinstance(side_range, list) else TypeError, match="should be a sequence of length 2" + ): + transforms.RandomZoomOut(side_range=side_range) + + for side_range in [[0.5, 1.5], [2.0, 1.0]]: + with pytest.raises(ValueError, match="Invalid side range"): + transforms.RandomZoomOut(side_range=side_range) + + @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_masks, + make_video, + ], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_params_correctness(self, side_range, make_input, device): + if make_input is make_image_pil and device != "cpu": + pytest.skip("PIL image tests with parametrization device!='cpu' will degenerate to that anyway.") + + transform = transforms.RandomZoomOut(side_range=side_range) + + input = make_input() + height, width = F.get_size(input) + + params = transform.make_params([input]) + assert "padding" in params + + padding = params["padding"] + assert len(padding) == 4 + + assert 0 <= padding[0] <= (side_range[1] - 1) * width + assert 0 <= padding[1] <= (side_range[1] - 1) * height + assert 0 <= padding[2] <= (side_range[1] - 1) * width + assert 0 <= padding[3] <= (side_range[1] - 1) * height + + +class TestRandomPhotometricDistort: + # Tests are light because this largely relies on the already tested + # `adjust_{brightness,contrast,saturation,hue}` and `permute_channels` kernels. + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + if make_input is make_image_pil and not (dtype is torch.uint8 and device == "cpu"): + pytest.skip( + "PIL image tests with parametrization other than dtype=torch.uint8 and device='cpu' " + "will degenerate to that anyway." + ) + + check_transform( + transforms.RandomPhotometricDistort( + brightness=(0.3, 0.4), contrast=(0.5, 0.6), saturation=(0.7, 0.8), hue=(-0.1, 0.2), p=1 + ), + make_input(dtype=dtype, device=device), + ) + + +class TestScaleJitter: + # Tests are light because this largely relies on the already tested `resize` kernels. 
+ + INPUT_SIZE = (17, 11) + TARGET_SIZE = (12, 13) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + if make_input is make_image_pil and device != "cpu": + pytest.skip("PIL image tests with parametrization device!='cpu' will degenerate to that anyway.") + + check_transform(transforms.ScaleJitter(self.TARGET_SIZE), make_input(self.INPUT_SIZE, device=device)) + + def test_make_params(self): + input_size = self.INPUT_SIZE + target_size = self.TARGET_SIZE + scale_range = (0.5, 1.5) + + transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) + params = transform.make_params([make_image(input_size)]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + height, width = size + + r_min = min(target_size[1] / input_size[0], target_size[0] / input_size[1]) * scale_range[0] + r_max = min(target_size[1] / input_size[0], target_size[0] / input_size[1]) * scale_range[1] + + assert int(input_size[0] * r_min) <= height <= int(input_size[0] * r_max) + assert int(input_size[1] * r_min) <= width <= int(input_size[1] * r_max) + + +class TestLinearTransform: + def _make_matrix_and_vector(self, input, *, device=None): + device = device or input.device + numel = math.prod(F.get_dimensions(input)) + transformation_matrix = torch.randn((numel, numel), device=device) + mean_vector = torch.randn((numel,), device=device) + return transformation_matrix, mean_vector + + def _sample_input_adapter(self, transform, input, device): + return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)} + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, dtype, device): + input = make_input(dtype=dtype, device=device) + check_transform( + transforms.LinearTransformation(*self._make_matrix_and_vector(input)), + input, + check_sample_input=self._sample_input_adapter, + # Compat check is failing on M1 with: + # AssertionError: Tensor-likes are not close! 
+ # Mismatched elements: 1 / 561 (0.2%) + # See https://github.com/pytorch/vision/issues/8453 + check_v1_compatibility=(sys.platform != "darwin"), + ) + + def test_transform_error(self): + with pytest.raises(ValueError, match="transformation_matrix should be square"): + transforms.LinearTransformation(transformation_matrix=torch.rand(2, 3), mean_vector=torch.rand(2)) + + with pytest.raises(ValueError, match="mean_vector should have the same length"): + transforms.LinearTransformation(transformation_matrix=torch.rand(2, 2), mean_vector=torch.rand(1)) + + for matrix_dtype, vector_dtype in [(torch.float32, torch.float64), (torch.float64, torch.float32)]: + with pytest.raises(ValueError, match="Input tensors should have the same dtype"): + transforms.LinearTransformation( + transformation_matrix=torch.rand(2, 2, dtype=matrix_dtype), + mean_vector=torch.rand(2, dtype=vector_dtype), + ) + + image = make_image() + transform = transforms.LinearTransformation(transformation_matrix=torch.rand(2, 2), mean_vector=torch.rand(2)) + with pytest.raises(ValueError, match="Input tensor and transformation matrix have incompatible shape"): + transform(image) + + transform = transforms.LinearTransformation(*self._make_matrix_and_vector(image)) + with pytest.raises(TypeError, match="does not support PIL images"): + transform(F.to_pil_image(image)) + + @needs_cuda + def test_transform_error_cuda(self): + for matrix_device, vector_device in [("cuda", "cpu"), ("cpu", "cuda")]: + with pytest.raises(ValueError, match="Input tensors should be on the same device"): + transforms.LinearTransformation( + transformation_matrix=torch.rand(2, 2, device=matrix_device), + mean_vector=torch.rand(2, device=vector_device), + ) + + for input_device, param_device in [("cuda", "cpu"), ("cpu", "cuda")]: + input = make_image(device=input_device) + transform = transforms.LinearTransformation(*self._make_matrix_and_vector(input, device=param_device)) + with pytest.raises( + ValueError, match="Input tensor should be on the same device as transformation matrix and mean vector" + ): + transform(input) + + +def make_image_numpy(*args, **kwargs): + image = make_image_tensor(*args, **kwargs) + return image.permute((1, 2, 0)).numpy() + + +class TestToImage: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_image_numpy]) + @pytest.mark.parametrize("fn", [F.to_image, transform_cls_to_functional(transforms.ToImage)]) + def test_functional_and_transform(self, make_input, fn): + input = make_input() + output = fn(input) + + assert isinstance(output, tv_tensors.Image) + + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + + if isinstance(input, torch.Tensor): + assert output.data_ptr() == input.data_ptr() + + def test_2d_np_array(self): + # Non-regression test for https://github.com/pytorch/vision/issues/8255 + input = np.random.rand(10, 10) + assert F.to_image(input).shape == (1, 10, 10) + + def test_functional_error(self): + with pytest.raises(TypeError, match="Input can either be a pure Tensor, a numpy array, or a PIL image"): + F.to_image(object()) + + +class TestToPILImage: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_image_numpy]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.to_pil_image, transform_cls_to_functional(transforms.ToPILImage)]) + def test_functional_and_transform(self, make_input, color_space, fn): + input = 
make_input(color_space=color_space) + output = fn(input) + + assert isinstance(output, PIL.Image.Image) + + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + + def test_functional_error(self): + with pytest.raises(TypeError, match="pic should be Tensor or ndarray"): + F.to_pil_image(object()) + + for ndim in [1, 4]: + with pytest.raises(ValueError, match="pic should be 2/3 dimensional"): + F.to_pil_image(torch.empty(*[1] * ndim)) + + with pytest.raises(ValueError, match="pic should not have > 4 channels"): + num_channels = 5 + F.to_pil_image(torch.empty(num_channels, 1, 1)) + + +class TestToTensor: + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_image_numpy]) + def test_smoke(self, make_input): + with pytest.warns(UserWarning, match="deprecated and will be removed"): + transform = transforms.ToTensor() + + input = make_input() + output = transform(input) + + input_size = list(input.shape[:2]) if isinstance(input, np.ndarray) else F.get_size(input) + assert F.get_size(output) == input_size + + +class TestPILToTensor: + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("fn", [F.pil_to_tensor, transform_cls_to_functional(transforms.PILToTensor)]) + def test_functional_and_transform(self, color_space, fn): + input = make_image_pil(color_space=color_space) + output = fn(input) + + assert isinstance(output, torch.Tensor) and not isinstance(output, tv_tensors.TVTensor) + assert F.get_size(output) == F.get_size(input) + + def test_functional_error(self): + with pytest.raises(TypeError, match="pic should be PIL Image"): + F.pil_to_tensor(object()) + + +class TestLambda: + @pytest.mark.parametrize("input", [object(), torch.empty(()), np.empty(()), "string", 1, 0.0]) + @pytest.mark.parametrize("types", [(), (torch.Tensor, np.ndarray)]) + def test_transform(self, input, types): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + transform = transforms.Lambda(was_applied_fn, *types) + output = transform(input) + + assert output is input + assert was_applied is (not types or isinstance(input, types)) + + +@pytest.mark.parametrize( + ("alias", "target"), + [ + pytest.param(alias, target, id=alias.__name__) + for alias, target in [ + (F.hflip, F.horizontal_flip), + (F.vflip, F.vertical_flip), + (F.get_image_num_channels, F.get_num_channels), + (F.to_pil_image, F.to_pil_image), + (F.elastic_transform, F.elastic), + (F.to_grayscale, F.rgb_to_grayscale), + ] + ], +) +def test_alias(alias, target): + assert alias is target + + +@pytest.mark.parametrize( + "make_inputs", + itertools.permutations( + [ + make_image_tensor, + make_image_tensor, + make_image_pil, + make_image, + make_video, + ], + 3, + ), +) +def test_pure_tensor_heuristic(make_inputs): + flat_inputs = [make_input() for make_input in make_inputs] + + def split_on_pure_tensor(to_split): + # This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts: + # 1. The first pure tensor. If none is present, this will be `None` + # 2. A list of the remaining pure tensors + # 3. A list of all other items + pure_tensors = [] + others = [] + # Splitting always happens on the original `flat_inputs` to avoid any erroneous type changes by the transform to + # affect the splitting. 
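+        # For example, if `flat_inputs` is `[pure_tensor_a, image, pure_tensor_b]`, the items aligned with it are + # split into (item for `pure_tensor_a`, [item for `pure_tensor_b`], [item for `image`]).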
+ for item, inpt in zip(to_split, flat_inputs): + (pure_tensors if is_pure_tensor(inpt) else others).append(item) + return pure_tensors[0] if pure_tensors else None, pure_tensors[1:], others + + class CopyCloneTransform(transforms.Transform): + def transform(self, inpt, params): + return inpt.clone() if isinstance(inpt, torch.Tensor) else inpt.copy() + + @staticmethod + def was_applied(output, inpt): + identity = output is inpt + if identity: + return False + + # Make sure nothing fishy is going on + assert_equal(output, inpt) + return True + + first_pure_tensor_input, other_pure_tensor_inputs, other_inputs = split_on_pure_tensor(flat_inputs) + + transform = CopyCloneTransform() + transformed_sample = transform(flat_inputs) + + first_pure_tensor_output, other_pure_tensor_outputs, other_outputs = split_on_pure_tensor(transformed_sample) + + if first_pure_tensor_input is not None: + if other_inputs: + assert not transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + else: + assert transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + + for output, inpt in zip(other_pure_tensor_outputs, other_pure_tensor_inputs): + assert not transform.was_applied(output, inpt) + + for input, output in zip(other_inputs, other_outputs): + assert transform.was_applied(output, input) + + +class TestRandomIoUCrop: + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) + def test_make_params(self, device, options): + orig_h, orig_w = size = (24, 32) + image = make_image(size) + bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), + format="XYXY", + canvas_size=size, + device=device, + ) + sample = [image, bboxes] + + transform = transforms.RandomIoUCrop(sampler_options=options) + + n_samples = 5 + for _ in range(n_samples): + + params = transform.make_params(sample) + + if options == [2.0]: + assert len(params) == 0 + return + + assert len(params["is_within_crop_area"]) > 0 + assert params["is_within_crop_area"].dtype == torch.bool + + assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) + assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) + + left, top = params["left"], params["top"] + new_h, new_w = params["height"], params["width"] + ious = box_iou( + bboxes, + torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), + ) + assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" + + def test__transform_empty_params(self, mocker): + transform = transforms.RandomIoUCrop(sampler_options=[2.0]) + image = tv_tensors.Image(torch.rand(1, 3, 4, 4)) + bboxes = tv_tensors.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) + label = torch.tensor([1]) + sample = [image, bboxes, label] + # Let's mock transform.make_params to control the output: + transform.make_params = mocker.MagicMock(return_value={}) + output = transform(sample) + torch.testing.assert_close(output, sample) + + def test_forward_assertion(self): + transform = transforms.RandomIoUCrop() + with pytest.raises( + TypeError, + match="requires input sample to contain tensor or PIL images and bounding boxes", + ): + transform(torch.tensor(0)) + + def test__transform(self, mocker): + transform = transforms.RandomIoUCrop() + + size = (32, 24) + image = make_image(size) + bboxes = make_bounding_boxes(format="XYXY", 
canvas_size=size, num_boxes=6) + masks = make_detection_masks(size, num_masks=6) + + sample = [image, bboxes, masks] + + is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) + + params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) + transform.make_params = mocker.MagicMock(return_value=params) + output = transform(sample) + + # check number of bboxes vs number of labels: + output_bboxes = output[1] + assert isinstance(output_bboxes, tv_tensors.BoundingBoxes) + assert (output_bboxes[~is_within_crop_area] == 0).all() + + output_masks = output[2] + assert isinstance(output_masks, tv_tensors.Mask) + + +class TestRandomShortestSize: + @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) + def test_make_params(self, min_size, max_size): + canvas_size = (3, 10) + + transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) + + sample = make_image(canvas_size) + params = transform.make_params([sample]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + + longer = max(size) + shorter = min(size) + if max_size is not None: + assert longer <= max_size + assert shorter <= max_size + else: + assert shorter in min_size + + +class TestRandomResize: + def test_make_params(self): + min_size = 3 + max_size = 6 + + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) + + for _ in range(10): + params = transform.make_params([]) + + assert isinstance(params["size"], list) and len(params["size"]) == 1 + size = params["size"][0] + + assert min_size <= size < max_size + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("label_type", (torch.Tensor, int)) +@pytest.mark.parametrize("dataset_return_type", (dict, tuple)) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +def test_classification_preset(image_type, label_type, dataset_return_type, to_tensor): + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = 1 if label_type is int else torch.tensor([1]) + + if dataset_return_type is dict: + sample = { + "image": image, + "label": label, + } + else: + sample = image, label + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + t = transforms.Compose( + [ + transforms.RandomResizedCrop((224, 224), antialias=True), + transforms.RandomHorizontalFlip(p=1), + transforms.RandAugment(), + transforms.TrivialAugmentWide(), + transforms.AugMix(), + transforms.AutoAugment(), + to_tensor, + # TODO: ConvertImageDtype is a pass-through on PIL images, is that + # intended? This results in a failure if we convert to tensor after + # it, because the image would still be uint8 which make Normalize + # fail. 
+ transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]), + transforms.RandomErasing(p=1), + ] + ) + + out = t(sample) + + assert type(out) == type(sample) + + if dataset_return_type is tuple: + out_image, out_label = out + else: + assert out.keys() == sample.keys() + out_image, out_label = out.values() + + assert out_image.shape[-2:] == (224, 224) + assert out_label == label + + +@pytest.mark.parametrize("input_size", [(17, 11), (11, 17), (11, 11)]) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_parallelogram_to_bounding_boxes(input_size, device): + # Assert that applying `_parallelogram_to_bounding_boxes` to rotated boxes + # does not modify the input. + bounding_boxes = make_bounding_boxes(input_size, format=tv_tensors.BoundingBoxFormat.XYXYXYXY, device=device) + actual = _parallelogram_to_bounding_boxes(bounding_boxes) + torch.testing.assert_close(actual, bounding_boxes, rtol=0, atol=1) + + # Test the transformation of two simple parallelograms. + # 1---2 1----2 + # / / -> | | + # 4---3 4----3 + + # 1---2 1----2 + # \ \ -> | | + # 4---3 4----3 + parallelogram = torch.tensor( + [[1, 0, 4, 0, 3, 2, 0, 2], [0, 0, 3, 0, 4, 2, 1, 2]], + dtype=torch.float32, + ) + expected = torch.tensor( + [ + [0, 0, 4, 0, 4, 2, 0, 2], + [0, 0, 4, 0, 4, 2, 0, 2], + ], + dtype=torch.float32, + ) + actual = _parallelogram_to_bounding_boxes(parallelogram) + torch.testing.assert_close(actual, expected) + + # Test the transformation of a simple parallelogram. + # 1 + # 1-2 / 2 + # / / -> / / + # 4-3 4 / + # 3 + # + # 1 + # 1-2 \ 2 + # \ \ -> \ \ + # 4-3 4 \ + # 3 + parallelogram = torch.tensor( + [[0, 4, 3, 1, 5, 1, 2, 4], [0, 1, 2, 1, 5, 4, 3, 4]], + dtype=torch.float32, + ) + expected = torch.tensor( + [[0, 4, 4, 0, 5, 1, 1, 5], [0, 1, 1, 0, 5, 4, 4, 5]], + dtype=torch.float32, + ) + actual = _parallelogram_to_bounding_boxes(parallelogram) + torch.testing.assert_close(actual, expected) + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite")) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +@pytest.mark.parametrize("sanitize", (True, False)) +def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): + torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + if data_augmentation == "hflip": + t = [ + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "lsj": + t = [ + transforms.ScaleJitter(target_size=(1024, 1024), antialias=True), + # Note: replaced FixedSizeCrop with RandomCrop, because we're + # leaving FixedSizeCrop in prototype for now, and it expects Label + # classes which we won't release yet.
+ # transforms.FixedSizeCrop( + # size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {tv_tensors.Mask: 0}) + # ), + transforms.RandomCrop((1024, 1024), pad_if_needed=True), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "multiscale": + t = [ + transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True + ), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssd": + t = [ + transforms.RandomPhotometricDistort(p=1), + transforms.RandomZoomOut(fill={"others": (123.0, 117.0, 104.0), tv_tensors.Mask: 0}, p=1), + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssdlite": + t = [ + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + if sanitize: + t += [transforms.SanitizeBoundingBoxes()] + t = transforms.Compose(t) + + num_boxes = 5 + H = W = 250 + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = torch.randint(0, 10, size=(num_boxes,)) + + boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = boxes.clamp(min=0, max=min(H, W)) + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + masks = tv_tensors.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) + + sample = { + "image": image, + "label": label, + "boxes": boxes, + "masks": masks, + } + + out = t(sample) + + if isinstance(to_tensor, transforms.ToTensor) and image_type is not tv_tensors.Image: + assert is_pure_tensor(out["image"]) + else: + assert isinstance(out["image"], tv_tensors.Image) + assert isinstance(out["label"], type(sample["label"])) + + num_boxes_expected = { + # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It + # doesn't remove them strictly speaking, it just marks some boxes as + # degenerate and those boxes will be later removed by + # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # param is True. + # Note that the values below are probably specific to the random seed + # set above (which is fine). 
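+        # The keys below are (sanitize, data_augmentation) pairs; any combination that is not listed is expected to + # keep all `num_boxes` boxes (see the `.get()` fallback below).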
+ (True, "ssd"): 5, + (True, "ssdlite"): 4, + }.get((sanitize, data_augmentation), num_boxes) + + assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected + + +class TestSanitizeBoundingBoxes: + def _get_boxes_and_valid_mask(self, H=256, W=128, min_size=10, min_area=10): + boxes_and_validity = [ + ([0, 1, 10, 1], False), # Y1 == Y2 + ([0, 1, 0, 20], False), # X1 == X2 + ([0, 0, min_size - 1, 10], False), # H < min_size + ([0, 0, 10, min_size - 1], False), # W < min_size + ([0, 0, 10, H + 1], False), # Y2 > H + ([0, 0, W + 1, 10], False), # X2 > W + ([-1, 1, 10, 20], False), # any < 0 + ([0, 0, -1, 20], False), # any < 0 + ([0, 0, -10, -1], False), # any < 0 + ([0, 0, min_size, 10], min_size * 10 >= min_area), # H < min_size + ([0, 0, 10, min_size], min_size * 10 >= min_area), # W < min_size + ([0, 0, W, H], W * H >= min_area), + ([1, 1, 30, 20], 29 * 19 >= min_area), + ([0, 0, 10, 10], 9 * 9 >= min_area), + ([1, 1, 30, 20], 29 * 19 >= min_area), + ] + + random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases + boxes, expected_valid_mask = zip(*boxes_and_validity) + boxes = tv_tensors.BoundingBoxes( + boxes, + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(H, W), + ) + + return boxes, expected_valid_mask + + @pytest.mark.parametrize("min_size, min_area", ((1, 1), (10, 1), (10, 101))) + @pytest.mark.parametrize( + "labels_getter", + ( + "default", + lambda inputs: inputs["labels"], + lambda inputs: (inputs["labels"], inputs["other_labels"]), + lambda inputs: [inputs["labels"], inputs["other_labels"]], + None, + lambda inputs: None, + ), + ) + @pytest.mark.parametrize("sample_type", (tuple, dict)) + def test_transform(self, min_size, min_area, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + + H, W = 256, 128 + boxes, expected_valid_mask = self._get_boxes_and_valid_mask(H=H, W=W, min_size=min_size, min_area=min_area) + valid_indices = [i for (i, is_valid) in enumerate(expected_valid_mask) if is_valid] + + labels = torch.arange(boxes.shape[0]) + masks = tv_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) + # other_labels corresponds to properties from COCO like iscrowd, area... 
+ # We only sanitize it when labels_getter returns a tuple + other_labels = torch.arange(boxes.shape[0]) + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) + sample = { + "image": input_img, + "labels": labels, + "boxes": boxes, + "other_labels": other_labels, + "whatever": whatever, + "None": None, + "masks": masks, + } + + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + + out = transforms.SanitizeBoundingBoxes(min_size=min_size, min_area=min_area, labels_getter=labels_getter)( + sample + ) + + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_other_labels = out[1]["other_labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_other_labels = out["other_labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + + assert out_image is input_img + assert out_whatever is whatever + + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + assert isinstance(out_masks, tv_tensors.Mask) + + if labels_getter is None or (callable(labels_getter) and labels_getter(sample) is None): + assert out_labels is labels + assert out_other_labels is other_labels + else: + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] + # This works because we conveniently set labels to arange(num_boxes) + assert out_labels.tolist() == valid_indices + + if callable(labels_getter) and isinstance(labels_getter(sample), (tuple, list)): + assert_equal(out_other_labels, out_labels) + else: + assert_equal(out_other_labels, other_labels) + + @pytest.mark.parametrize("input_type", (torch.Tensor, tv_tensors.BoundingBoxes)) + def test_functional(self, input_type): + # Note: the "functional" F.sanitize_bounding_boxes was added after the class, so there is some + # redundancy with test_transform() in terms of correctness checks. But that's OK. 
+ + H, W, min_size = 256, 128, 10 + + boxes, expected_valid_mask = self._get_boxes_and_valid_mask(H=H, W=W, min_size=min_size) + + if input_type is tv_tensors.BoundingBoxes: + format = canvas_size = None + else: + # just passing "XYXY" explicitly to make sure we support strings + format, canvas_size = "XYXY", boxes.canvas_size + boxes = boxes.as_subclass(torch.Tensor) + + boxes, valid = F.sanitize_bounding_boxes(boxes, format=format, canvas_size=canvas_size, min_size=min_size) + + assert_equal(valid, torch.tensor(expected_valid_mask)) + assert type(valid) == torch.Tensor + assert boxes.shape[0] == sum(valid) + assert isinstance(boxes, input_type) + + def test_kernel(self): + H, W, min_size = 256, 128, 10 + boxes, _ = self._get_boxes_and_valid_mask(H=H, W=W, min_size=min_size) + + format, canvas_size = boxes.format, boxes.canvas_size + boxes = boxes.as_subclass(torch.Tensor) + + check_kernel( + F.sanitize_bounding_boxes, + input=boxes, + format=format, + canvas_size=canvas_size, + check_batched_vs_unbatched=False, + ) + + def test_no_label(self): + # Non-regression test for https://github.com/pytorch/vision/issues/7878 + + img = make_image() + boxes = make_bounding_boxes() + + with pytest.raises(ValueError, match="or a two-tuple whose second item is a dict"): + transforms.SanitizeBoundingBoxes()(img, boxes) + + out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) + assert isinstance(out_img, tv_tensors.Image) + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + + def test_errors_transform(self): + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) + + with pytest.raises(ValueError, match="min_size must be >= 1"): + transforms.SanitizeBoundingBoxes(min_size=0) + with pytest.raises(ValueError, match="min_area must be >= 1"): + transforms.SanitizeBoundingBoxes(min_area=0) + with pytest.raises(ValueError, match="labels_getter should either be 'default'"): + transforms.SanitizeBoundingBoxes(labels_getter=12) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} + transforms.SanitizeBoundingBoxes()(bad_labels_key) + + with pytest.raises(ValueError, match="must be a tensor"): + not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} + transforms.SanitizeBoundingBoxes()(not_a_tensor) + + with pytest.raises(ValueError, match="Number of boxes"): + different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} + transforms.SanitizeBoundingBoxes()(different_sizes) + + def test_errors_functional(self): + + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) + + with pytest.raises(ValueError, match="canvas_size cannot be None if bounding_boxes is a pure tensor"): + F.sanitize_bounding_boxes(good_bbox.as_subclass(torch.Tensor), format="XYXY", canvas_size=None) + + with pytest.raises(ValueError, match="canvas_size cannot be None if bounding_boxes is a pure tensor"): + F.sanitize_bounding_boxes(good_bbox.as_subclass(torch.Tensor), format=None, canvas_size=(10, 10)) + + with pytest.raises(ValueError, match="canvas_size must be None when bounding_boxes is a tv_tensors"): + F.sanitize_bounding_boxes(good_bbox, format="XYXY", canvas_size=None) + + with pytest.raises(ValueError, match="canvas_size must be None when bounding_boxes is a tv_tensors"): + 
F.sanitize_bounding_boxes(good_bbox, format="XYXY", canvas_size=None) + + with pytest.raises(ValueError, match="bounding_boxes must be a tv_tensors.BoundingBoxes instance or a"): + F.sanitize_bounding_boxes(good_bbox.tolist()) + + +class TestJPEG: + @pytest.mark.parametrize("quality", [5, 75]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + def test_kernel_image(self, quality, color_space): + check_kernel(F.jpeg_image, make_image(color_space=color_space), quality=quality) + + def test_kernel_video(self): + check_kernel(F.jpeg_video, make_video(), quality=5) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.jpeg, make_input(), quality=5) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.jpeg_image, torch.Tensor), + (F._augment._jpeg_image_pil, PIL.Image.Image), + (F.jpeg_image, tv_tensors.Image), + (F.jpeg_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.jpeg, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + @pytest.mark.parametrize("quality", [5, (10, 20)]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + def test_transform(self, make_input, quality, color_space): + check_transform(transforms.JPEG(quality=quality), make_input(color_space=color_space)) + + @pytest.mark.parametrize("quality", [5]) + def test_functional_image_correctness(self, quality): + image = make_image() + + actual = F.jpeg(image, quality=quality) + expected = F.to_image(F.jpeg(F.to_pil_image(image), quality=quality)) + + # NOTE: this will fail if torchvision and Pillow use different JPEG encoder/decoder + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + @pytest.mark.parametrize("quality", [5, (10, 20)]) + @pytest.mark.parametrize("color_space", ["RGB", "GRAY"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, quality, color_space, seed): + image = make_image(color_space=color_space) + + transform = transforms.JPEG(quality=quality) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected, rtol=0, atol=1) + + @pytest.mark.parametrize("quality", [5, (10, 20)]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transformmake_params_bounds(self, quality, seed): + transform = transforms.JPEG(quality=quality) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform.make_params([]) + + if isinstance(quality, int): + assert params["quality"] == quality + else: + assert quality[0] <= params["quality"] <= quality[1] + + @pytest.mark.parametrize("quality", [[0], [0, 0, 0]]) + def test_transform_sequence_len_error(self, quality): + with pytest.raises(ValueError, match="quality should be a sequence of length 2"): + transforms.JPEG(quality=quality) + + @pytest.mark.parametrize("quality", [-1, 0, 150]) + def test_transform_invalid_quality_error(self, quality): + with pytest.raises(ValueError, match="quality must be an integer from 1 to 100"): + transforms.JPEG(quality=quality) + + @pytest.mark.parametrize("quality", [None, True]) + def test_transform_quality_type_error(self, quality): + with pytest.raises(TypeError, match="quality"): + 
transforms.JPEG(quality=quality) + + +class TestUtils: + # TODO: Still need to test has_all, has_any, check_type and get_bounding_boxes + @pytest.mark.parametrize( + "make_input1", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize( + "make_input2", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw]) + def test_query_size_and_query_chw(self, make_input1, make_input2, query): + size = (32, 64) + input1 = make_input1(size) + input2 = make_input2(size) + + if query is transforms.query_chw and not any( + transforms.check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video)) + for inpt in (input1, input2) + ): + return + + expected = size if query is transforms.query_size else ((3,) + size) + assert query([input1, input2]) == expected + + @pytest.mark.parametrize( + "make_input1", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize( + "make_input2", [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask] + ) + @pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw]) + def test_different_sizes(self, make_input1, make_input2, query): + input1 = make_input1((10, 10)) + input2 = make_input2((20, 20)) + if query is transforms.query_chw and not all( + transforms.check_type(inpt, (is_pure_tensor, tv_tensors.Image, PIL.Image.Image, tv_tensors.Video)) + for inpt in (input1, input2) + ): + return + with pytest.raises(ValueError, match="Found multiple"): + query([input1, input2]) + + @pytest.mark.parametrize("query", [transforms.query_size, transforms.query_chw]) + def test_no_valid_input(self, query): + with pytest.raises(TypeError, match="No image"): + query(["blah"]) diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py new file mode 100644 index 00000000000..dab6d525a38 --- /dev/null +++ b/test/test_transforms_v2_utils.py @@ -0,0 +1,102 @@ +import PIL.Image +import pytest + +import torch + +import torchvision.transforms.v2._utils +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_masks, make_image, make_keypoints + +from torchvision import tv_tensors +from torchvision.transforms.v2._utils import has_all, has_any +from torchvision.transforms.v2.functional import to_pil_image + + +IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=tv_tensors.BoundingBoxFormat.XYXY) +MASK = make_detection_masks(DEFAULT_SIZE) +KEYPOINTS = make_keypoints(DEFAULT_SIZE) + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.KeyPoints,), True), + ((MASK,), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.KeyPoints), False), + ((BOUNDING_BOX,), (tv_tensors.Image, tv_tensors.Mask, 
tv_tensors.KeyPoints), False), + ((IMAGE,), (tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), False), + ((KEYPOINTS,), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), + True, + ), + ((), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), False), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (lambda obj: isinstance(obj, tv_tensors.Image),), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (lambda _: True,), True), + ((IMAGE,), (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True), + ( + (torch.Tensor(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ( + (to_pil_image(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ], +) +def test_has_any(sample, types, expected): + assert has_any(sample, *types) is expected + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.Mask, tv_tensors.KeyPoints), True), + ((IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), (tv_tensors.BoundingBoxes, tv_tensors.KeyPoints), True), + ( + (IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), + (tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), + True, + ), + ( + (IMAGE, BOUNDING_BOX, MASK, KEYPOINTS), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask, tv_tensors.KeyPoints), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), False), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (lambda obj: isinstance(obj, (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask)),), + True, + ), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), + ], +) +def test_has_all(sample, types, expected): + assert has_all(sample, *types) is expected diff --git a/test/test_transforms_video.py b/test/test_transforms_video.py index 21594868f09..4ad57e6a98e 100644 --- a/test/test_transforms_video.py +++ b/test/test_transforms_video.py @@ -23,8 +23,8 @@ def test_random_crop_video(self): numFrames = random.randint(4, 128) height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 
2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 clip = torch.randint(0, 256, (numFrames, height, width, 3), dtype=torch.uint8) result = Compose( [ @@ -41,8 +41,8 @@ def test_random_resized_crop_video(self): numFrames = random.randint(4, 128) height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 clip = torch.randint(0, 256, (numFrames, height, width, 3), dtype=torch.uint8) result = Compose( [ @@ -59,8 +59,8 @@ def test_center_crop_video(self): numFrames = random.randint(4, 128) height = random.randint(10, 32) * 2 width = random.randint(10, 32) * 2 - oheight = random.randint(5, (height - 2) / 2) * 2 - owidth = random.randint(5, (width - 2) / 2) * 2 + oheight = random.randint(5, (height - 2) // 2) * 2 + owidth = random.randint(5, (width - 2) // 2) * 2 clip = torch.ones((numFrames, height, width, 3), dtype=torch.uint8) * 255 oh1 = (height - oheight) // 2 diff --git a/test/test_tv_tensors.py b/test/test_tv_tensors.py new file mode 100644 index 00000000000..f9d545eb9c9 --- /dev/null +++ b/test/test_tv_tensors.py @@ -0,0 +1,445 @@ +from copy import deepcopy + +import pytest +import torch +from common_utils import ( + assert_equal, + make_bounding_boxes, + make_image, + make_keypoints, + make_segmentation_mask, + make_video, +) +from PIL import Image + +from torchvision import tv_tensors + + +@pytest.fixture(autouse=True) +def restore_tensor_return_type(): + # This is for security, as we should already be restoring the default manually in each test anyway + # (at least at the time of writing...) 
+ yield + tv_tensors.set_return_type("Tensor") + + +@pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)]) +def test_image_instance(data): + image = tv_tensors.Image(data) + assert isinstance(image, torch.Tensor) + assert image.ndim == 3 and image.shape[0] == 3 + + +@pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)]) +def test_mask_instance(data): + mask = tv_tensors.Mask(data) + assert isinstance(mask, torch.Tensor) + assert mask.ndim == 3 and mask.shape[0] == 1 + + +@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]], [1, 2, 3, 4]]) +@pytest.mark.parametrize( + "format", ["XYXY", "CXCYWH", tv_tensors.BoundingBoxFormat.XYXY, tv_tensors.BoundingBoxFormat.XYWH] +) +def test_bbox_instance(data, format): + bboxes = tv_tensors.BoundingBoxes(data, format=format, canvas_size=(32, 32)) + assert isinstance(bboxes, torch.Tensor) + assert bboxes.ndim == 2 and bboxes.shape[1] == 4 + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[(format.upper())] + assert bboxes.format == format + + +@pytest.mark.parametrize( + "format, is_rotated_expected", + [ + ("XYXY", False), + ("XYWH", False), + ("CXCYWH", False), + ("XYXYXYXY", True), + ("XYWHR", True), + ("CXCYWHR", True), + (tv_tensors.BoundingBoxFormat.XYXY, False), + (tv_tensors.BoundingBoxFormat.XYWH, False), + (tv_tensors.BoundingBoxFormat.CXCYWH, False), + (tv_tensors.BoundingBoxFormat.XYXYXYXY, True), + (tv_tensors.BoundingBoxFormat.XYWHR, True), + (tv_tensors.BoundingBoxFormat.CXCYWHR, True), + ], +) +@pytest.mark.parametrize("scripted", (False, True)) +def test_bbox_format(format, is_rotated_expected, scripted): + fn = tv_tensors.is_rotated_bounding_format + if scripted: + fn = torch.jit.script(fn) + assert fn(format) == is_rotated_expected + + +@pytest.mark.parametrize( + "format, support_integer_dtype", + [ + ("XYXY", True), + ("XYWH", True), + ("CXCYWH", True), + ("XYXYXYXY", False), + ("XYWHR", False), + ("CXCYWHR", False), + (tv_tensors.BoundingBoxFormat.XYXY, True), + (tv_tensors.BoundingBoxFormat.XYWH, True), + (tv_tensors.BoundingBoxFormat.CXCYWH, True), + (tv_tensors.BoundingBoxFormat.XYXYXYXY, False), + (tv_tensors.BoundingBoxFormat.XYWHR, False), + (tv_tensors.BoundingBoxFormat.CXCYWHR, False), + ], +) +@pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) +def test_bbox_format_dtype(format, support_integer_dtype, input_dtype): + tensor = torch.randint(0, 32, size=(5, 2), dtype=input_dtype) + if not input_dtype.is_floating_point and not support_integer_dtype: + with pytest.raises(ValueError, match="Rotated bounding boxes should be floating point tensors"): + tv_tensors.BoundingBoxes(tensor, format=format, canvas_size=(32, 32)) + else: + tv_tensors.BoundingBoxes(tensor, format=format, canvas_size=(32, 32)) + + +def test_bbox_dim_error(): + data_3d = [[[1, 2, 3, 4]]] + with pytest.raises(ValueError, match="Expected a 1D or 2D tensor, got 3D"): + tv_tensors.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32)) + + +@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 2)), [[0, 0], [2, 2]], [1, 2]]) +def test_keypoints_instance(data): + kpoint = tv_tensors.KeyPoints(data, canvas_size=(32, 32)) + assert isinstance(kpoint, torch.Tensor) + assert type(kpoint) is tv_tensors.KeyPoints + assert kpoint.shape[-1] == 2 + + +def test_keypoints_shape_error(): + with pytest.raises(ValueError, match="Expected a tensor of shape"): + 
tv_tensors.KeyPoints(torch.tensor([[1, 2, 3]]), canvas_size=(11, 7)) + + +@pytest.mark.parametrize( + ("data", "input_requires_grad", "expected_requires_grad"), + [ + ([[[0.0, 1.0], [0.0, 1.0]]], None, False), + ([[[0.0, 1.0], [0.0, 1.0]]], False, False), + ([[[0.0, 1.0], [0.0, 1.0]]], True, True), + (torch.rand(3, 16, 16, requires_grad=False), None, False), + (torch.rand(3, 16, 16, requires_grad=False), False, False), + (torch.rand(3, 16, 16, requires_grad=False), True, True), + (torch.rand(3, 16, 16, requires_grad=True), None, True), + (torch.rand(3, 16, 16, requires_grad=True), False, False), + (torch.rand(3, 16, 16, requires_grad=True), True, True), + ], +) +def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): + tv_tensor = tv_tensors.Image(data, requires_grad=input_requires_grad) + assert tv_tensor.requires_grad is expected_requires_grad + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +def test_isinstance(make_input): + assert isinstance(make_input(), torch.Tensor) + + +def test_wrapping_no_copy(): + tensor = torch.rand(3, 16, 16) + image = tv_tensors.Image(tensor) + + assert image.data_ptr() == tensor.data_ptr() + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +def test_to_wrapping(make_input): + dp = make_input() + + dp_to = dp.to(torch.float64) + + assert type(dp_to) is type(dp) + assert dp_to.dtype is torch.float64 + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_to_tv_tensor_reference(make_input, return_type): + tensor = torch.rand((3, 16, 16), dtype=torch.float64) + dp = make_input() + + with tv_tensors.set_return_type(return_type): + tensor_to = tensor.to(dp) + + assert type(tensor_to) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert tensor_to.dtype is dp.dtype + assert type(tensor) is torch.Tensor + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_clone_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + dp_clone = dp.clone() + + assert type(dp_clone) is type(dp) + assert dp_clone.data_ptr() != dp.data_ptr() + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_requires_grad__wrapping(make_input, return_type): + dp = make_input(dtype=torch.float) + + assert not dp.requires_grad + + with tv_tensors.set_return_type(return_type): + dp_requires_grad = dp.requires_grad_(True) + + assert type(dp_requires_grad) is type(dp) + assert dp.requires_grad + assert dp_requires_grad.requires_grad + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_detach_wrapping(make_input, return_type): + dp = make_input(dtype=torch.float).requires_grad_(True) + + with tv_tensors.set_return_type(return_type): + dp_detached = dp.detach() + + assert type(dp_detached) is type(dp) + + 
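For context, a minimal sketch (not part of this patch) of the subclass-preservation semantics that the wrapping tests above and the metadata tests below assert, using only the tv_tensors API introduced elsewhere in this diff:

    import torch
    from torchvision import tv_tensors

    img = tv_tensors.Image(torch.rand(3, 16, 16))  # wraps the tensor, no copy

    # Ops in _FORCE_TORCHFUNCTION_SUBCLASS (.to(), .clone(), .detach(), ...) keep the subclass.
    assert type(img.clone()) is tv_tensors.Image

    # Any other operation returns a plain torch.Tensor under the default return type ...
    assert type(img * 2) is torch.Tensor

    # ... unless the return type is switched, or the result is re-wrapped explicitly.
    with tv_tensors.set_return_type("TVTensor"):
        assert type(img * 2) is tv_tensors.Image
    assert type(tv_tensors.wrap(img * 2, like=img)) is tv_tensors.Image
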
+@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_force_subclass_with_metadata(return_type): + # Sanity checks for the ops in _FORCE_TORCHFUNCTION_SUBCLASS and tv_tensors with metadata + # Largely the same as above, we additionally check that the metadata is preserved + format, canvas_size = "XYXY", (32, 32) + bbox = tv_tensors.BoundingBoxes([[0, 0, 5, 5], [2, 2, 7, 7]], format=format, canvas_size=canvas_size) + kpoints = tv_tensors.KeyPoints([[0, 0], [2, 2]], canvas_size=canvas_size) + + tv_tensors.set_return_type(return_type) + bbox = bbox.clone() + kpoints = kpoints.clone() + if return_type == "TVTensor": + assert kpoints.canvas_size == canvas_size + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.to(torch.float64) + kpoints = kpoints.to(torch.float64) + if return_type == "TVTensor": + assert kpoints.canvas_size == canvas_size + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.detach() + kpoints = kpoints.detach() + if return_type == "TVTensor": + assert kpoints.canvas_size == canvas_size + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + if torch.cuda.is_available(): + bbox = bbox.pin_memory() + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + assert not bbox.requires_grad + assert not kpoints.requires_grad + bbox.requires_grad_(True) + kpoints.requires_grad_(True) + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + assert bbox.requires_grad + assert kpoints.canvas_size == canvas_size + assert kpoints.requires_grad + tv_tensors.set_return_type("tensor") + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_other_op_no_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize( + "op", + [ + lambda t: t.numpy(), + lambda t: t.tolist(), + lambda t: t.max(dim=-1), + ], +) +def test_no_tensor_output_op_no_wrapping(make_input, op): + dp = make_input() + + output = op(dp) + + assert type(output) is not type(dp) + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_inplace_op_no_wrapping(make_input, return_type): + dp = make_input() + original_type = type(dp) + + with tv_tensors.set_return_type(return_type): + output = dp.add_(0) + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert type(dp) is original_type + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +def test_wrap(make_input): + dp = make_input() + + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + dp_new = tv_tensors.wrap(output, like=dp) + + assert type(dp_new) is type(dp) + assert dp_new.data_ptr() == output.data_ptr() + + +@pytest.mark.parametrize( + 
"make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("requires_grad", [False, True]) +def test_deepcopy(make_input, requires_grad): + dp = make_input(dtype=torch.float) + + dp.requires_grad_(requires_grad) + + dp_deepcopied = deepcopy(dp) + + assert dp_deepcopied is not dp + assert dp_deepcopied.data_ptr() != dp.data_ptr() + assert_equal(dp_deepcopied, dp) + + assert type(dp_deepcopied) is type(dp) + assert dp_deepcopied.requires_grad is requires_grad + + +@pytest.mark.parametrize( + "make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video, make_keypoints] +) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +@pytest.mark.parametrize( + "op", + ( + lambda dp: dp + torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) + dp, + lambda dp: dp * torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) * dp, + lambda dp: dp + 3, + lambda dp: 3 + dp, + lambda dp: dp + dp, + lambda dp: dp.sum(), + lambda dp: dp.reshape(-1), + lambda dp: dp.int(), + lambda dp: torch.stack([dp, dp]), + lambda dp: torch.chunk(dp, 2)[0], + lambda dp: torch.unbind(dp)[0], + ), +) +def test_usual_operations(make_input, return_type, op): + + dp = make_input() + with tv_tensors.set_return_type(return_type): + out = op(dp) + assert type(out) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + if isinstance(dp, tv_tensors.BoundingBoxes) and return_type == "TVTensor": + assert hasattr(out, "format") + assert hasattr(out, "canvas_size") + + +def test_subclasses(): + img = make_image() + masks = make_segmentation_mask() + + with pytest.raises(TypeError, match="unsupported operand"): + img + masks + + +def test_set_return_type(): + img = make_image() + + assert type(img + 3) is torch.Tensor + + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + assert type(img + 3) is torch.Tensor + + tv_tensors.set_return_type("TVTensor") + assert type(img + 3) is tv_tensors.Image + + with tv_tensors.set_return_type("tensor"): + assert type(img + 3) is torch.Tensor + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + tv_tensors.set_return_type("tensor") + assert type(img + 3) is torch.Tensor + assert type(img + 3) is torch.Tensor + # Exiting a context manager will restore the return type as it was prior to entering it, + # regardless of whether the "global" tv_tensors.set_return_type() was called within the context manager. 
+ assert type(img + 3) is tv_tensors.Image + + tv_tensors.set_return_type("tensor") + + +def test_return_type_input(): + img = make_image() + + # Case-insensitive + with tv_tensors.set_return_type("tvtensor"): + assert type(img + 3) is tv_tensors.Image + + with pytest.raises(ValueError, match="return_type must be"): + tv_tensors.set_return_type("typo") + + tv_tensors.set_return_type("tensor") + + +def test_box_clamping_mode_default_and_error(): + assert ( + tv_tensors.BoundingBoxes([0.0, 0.0, 10.0, 10.0], format="XYXY", canvas_size=(100, 100)).clamping_mode == "soft" + ) + assert ( + tv_tensors.BoundingBoxes([0.0, 0.0, 10.0, 10.0, 0.0], format="XYWHR", canvas_size=(100, 100)).clamping_mode + == "soft" + ) + + with pytest.raises(ValueError, match="clamping_mode must be"): + tv_tensors.BoundingBoxes([0, 0, 10, 10], format="XYXY", canvas_size=(100, 100), clamping_mode="bad") diff --git a/test/test_utils.py b/test/test_utils.py index dde3ee90dc3..8b6f357ce6e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,14 +9,33 @@ import torch import torchvision.transforms.functional as F import torchvision.utils as utils -from common_utils import assert_equal +from common_utils import assert_equal, cpu_and_cuda from PIL import __version__ as PILLOW_VERSION, Image, ImageColor +from torchvision.transforms.v2.functional import to_dtype PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split(".")) boxes = torch.tensor([[0, 0, 20, 20], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float) - +rotated_boxes = torch.tensor( + [ + [100, 150, 150, 150, 150, 250, 100, 250], + [200, 350, 250, 350, 250, 250, 200, 250], + [300, 200, 200, 200, 200, 250, 300, 250], + # Not really a rectangle, but it doesn't matter + [ + 100, + 100, + 200, + 50, + 290, + 350, + 200, + 400, + ], + ], + dtype=torch.float, +) keypoints = torch.tensor([[[10, 10], [5, 5], [2, 2]], [[20, 20], [30, 30], [3, 3]]], dtype=torch.float) @@ -105,7 +124,7 @@ def test_draw_boxes(): res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy()) res.save(path) - if PILLOW_VERSION >= (8, 2): + if PILLOW_VERSION >= (10, 1): # The reference image is only valid for new PIL versions expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) assert_equal(result, expected) @@ -115,11 +134,85 @@ def test_draw_boxes(): assert_equal(img, img_cp) +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_boxes_with_coloured_labels(): + img = torch.full((3, 100, 100), 255, dtype=torch.uint8) + labels = ["a", "b", "c", "d"] + colors = ["green", "#FF00FF", (0, 255, 0), "red"] + label_colors = ["green", "red", (0, 255, 0), "#FF00FF"] + result = utils.draw_bounding_boxes(img, boxes, labels=labels, colors=colors, fill=True, label_colors=label_colors) + + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_boxes_different_label_colors.png" + ) + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_boxes_with_coloured_label_backgrounds(): + img = torch.full((3, 100, 100), 255, dtype=torch.uint8) + labels = ["a", "b", "c", "d"] + colors = ["green", "#FF00FF", (0, 255, 0), "red"] + label_colors = ["green", "red", (0, 255, 0), "#FF00FF"] + result = utils.draw_bounding_boxes( + img, boxes, labels=labels, colors=colors, fill=True, 
label_colors=label_colors, fill_labels=True + ) + + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_boxes_different_label_fill_colors.png" + ) + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_rotated_boxes(): + img = torch.full((3, 500, 500), 255, dtype=torch.uint8) + colors = ["blue", "yellow", (0, 255, 0), "black"] + + result = utils.draw_bounding_boxes(img, rotated_boxes, colors=colors) + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_rotated_boxes.png") + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.skipif(PILLOW_VERSION < (10, 1), reason="The reference image is only valid for PIL >= 10.1") +def test_draw_rotated_boxes_fill(): + img = torch.full((3, 500, 500), 255, dtype=torch.uint8) + colors = ["blue", "yellow", (0, 255, 0), "black"] + + result = utils.draw_bounding_boxes(img, rotated_boxes, colors=colors, fill=True) + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_rotated_boxes_fill.png") + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +@pytest.mark.parametrize("fill", [True, False]) +def test_draw_boxes_dtypes(fill): + img_uint8 = torch.full((3, 100, 100), 255, dtype=torch.uint8) + out_uint8 = utils.draw_bounding_boxes(img_uint8, boxes, fill=fill) + + assert img_uint8 is not out_uint8 + assert out_uint8.dtype == torch.uint8 + + img_float = to_dtype(img_uint8, torch.float, scale=True) + out_float = utils.draw_bounding_boxes(img_float, boxes, fill=fill) + + assert img_float is not out_float + assert out_float.is_floating_point() + + torch.testing.assert_close(out_uint8, to_dtype(out_float, torch.uint8, scale=True), rtol=0, atol=1) + + @pytest.mark.parametrize("colors", [None, ["red", "blue", "#FF00FF", (1, 34, 122)], "red", "#FF00FF", (1, 34, 122)]) def test_draw_boxes_colors(colors): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) utils.draw_bounding_boxes(img, boxes, fill=False, width=7, colors=colors) + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): + utils.draw_bounding_boxes(image=img, boxes=boxes, colors=[]) + def test_draw_boxes_vanilla(): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) @@ -148,7 +241,6 @@ def test_draw_boxes_grayscale(): def test_draw_invalid_boxes(): img_tp = ((1, 1, 1), (1, 2, 3)) - img_wrong1 = torch.full((3, 5, 5), 255, dtype=torch.float) img_wrong2 = torch.full((1, 3, 5, 5), 255, dtype=torch.uint8) img_correct = torch.zeros((3, 10, 10), dtype=torch.uint8) boxes = torch.tensor([[0, 0, 20, 20], [0, 0, 0, 0], [10, 15, 30, 35], [23, 35, 93, 95]], dtype=torch.float) @@ -158,8 +250,6 @@ def test_draw_invalid_boxes(): with pytest.raises(TypeError, match="Tensor expected"): utils.draw_bounding_boxes(img_tp, boxes) - with pytest.raises(ValueError, match="Tensor uint8 expected"): - utils.draw_bounding_boxes(img_wrong1, boxes) with pytest.raises(ValueError, match="Pass individual images, not batches"): utils.draw_bounding_boxes(img_wrong2, boxes) with pytest.raises(ValueError, match="Only grayscale and RGB images are supported"): @@ -184,7 +274,7 @@ def test_draw_no_boxes(): boxes = torch.full((0, 4), 0, dtype=torch.float) with 
pytest.warns(UserWarning, match=re.escape("boxes doesn't contain any box. No box was drawn")): res = utils.draw_bounding_boxes(img, boxes) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() @@ -200,19 +290,17 @@ def test_draw_no_boxes(): ], ) @pytest.mark.parametrize("alpha", (0, 0.5, 0.7, 1)) -def test_draw_segmentation_masks(colors, alpha): +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks(colors, alpha, device): """This test makes sure that masks draw their corresponding color where they should""" num_masks, h, w = 2, 100, 100 dtype = torch.uint8 - img = torch.randint(0, 256, size=(3, h, w), dtype=dtype) - masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool) + img = torch.randint(0, 256, size=(3, h, w), dtype=dtype, device=device) + masks = torch.zeros((num_masks, h, w), dtype=torch.bool, device=device) + masks[0, 10:20, 10:20] = True + masks[1, 15:25, 15:25] = True - # For testing we enforce that there's no overlap between the masks. The - # current behaviour is that the last mask's color will take priority when - # masks overlap, but this makes testing slightly harder so we don't really - # care overlap = masks[0] & masks[1] - masks[:, overlap] = False out = utils.draw_segmentation_masks(img, masks, colors=colors, alpha=alpha) assert out.dtype == dtype @@ -231,22 +319,46 @@ def test_draw_segmentation_masks(colors, alpha): for mask, color in zip(masks, colors): if isinstance(color, str): color = ImageColor.getrgb(color) - color = torch.tensor(color, dtype=dtype) + color = torch.tensor(color, dtype=dtype, device=device) if alpha == 1: - assert (out[:, mask] == color[:, None]).all() + assert (out[:, mask & ~overlap] == color[:, None]).all() elif alpha == 0: - assert (out[:, mask] == img[:, mask]).all() + assert (out[:, mask & ~overlap] == img[:, mask & ~overlap]).all() - interpolated_color = (img[:, mask] * (1 - alpha) + color[:, None] * alpha).to(dtype) - torch.testing.assert_close(out[:, mask], interpolated_color, rtol=0.0, atol=1.0) + interpolated_color = (img[:, mask & ~overlap] * (1 - alpha) + color[:, None] * alpha).to(dtype) + torch.testing.assert_close(out[:, mask & ~overlap], interpolated_color, rtol=0.0, atol=1.0) + interpolated_overlap = (img[:, overlap] * (1 - alpha)).to(dtype) + torch.testing.assert_close(out[:, overlap], interpolated_overlap, rtol=0.0, atol=1.0) -def test_draw_segmentation_masks_errors(): + +def test_draw_segmentation_masks_dtypes(): + num_masks, h, w = 2, 100, 100 + + masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool) + + img_uint8 = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8) + out_uint8 = utils.draw_segmentation_masks(img_uint8, masks) + + assert img_uint8 is not out_uint8 + assert out_uint8.dtype == torch.uint8 + + img_float = to_dtype(img_uint8, torch.float, scale=True) + out_float = utils.draw_segmentation_masks(img_float, masks) + + assert img_float is not out_float + assert out_float.is_floating_point() + + torch.testing.assert_close(out_uint8, to_dtype(out_float, torch.uint8, scale=True), rtol=0, atol=1) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks_errors(device): h, w = 10, 10 - masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool) - img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8) + masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool, device=device) + img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8, device=device) 
with pytest.raises(TypeError, match="The image must be a tensor"): utils.draw_segmentation_masks(image="Not A Tensor Image", masks=masks) @@ -268,22 +380,23 @@ def test_draw_segmentation_masks_errors(): with pytest.raises(ValueError, match="must have the same height and width"): masks_bad_shape = torch.randint(0, 2, size=(h + 4, w), dtype=torch.bool) utils.draw_segmentation_masks(image=img, masks=masks_bad_shape) - with pytest.raises(ValueError, match="There are more masks"): + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): utils.draw_segmentation_masks(image=img, masks=masks, colors=[]) - with pytest.raises(ValueError, match="colors must be a tuple or a string, or a list thereof"): + with pytest.raises(ValueError, match="`colors` must be a tuple or a string, or a list thereof"): bad_colors = np.array(["red", "blue"]) # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) - with pytest.raises(ValueError, match="It seems that you passed a tuple of colors instead of"): + with pytest.raises(ValueError, match="If passed as tuple, colors should be an RGB triplet"): bad_colors = ("red", "blue") # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) -def test_draw_no_segmention_mask(): - img = torch.full((3, 100, 100), 0, dtype=torch.uint8) - masks = torch.full((0, 100, 100), 0, dtype=torch.bool) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_no_segmention_mask(device): + img = torch.full((3, 100, 100), 0, dtype=torch.uint8, device=device) + masks = torch.full((0, 100, 100), 0, dtype=torch.bool, device=device) with pytest.warns(UserWarning, match=re.escape("masks doesn't contain any mask. No mask was drawn")): res = utils.draw_segmentation_masks(img, masks) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() @@ -314,6 +427,13 @@ def test_draw_keypoints_vanilla(): assert_equal(img, img_cp) +def test_draw_keypoins_K_equals_one(): + # Non-regression test for https://github.com/pytorch/vision/pull/8439 + img = torch.full((3, 100, 100), 0, dtype=torch.uint8) + keypoints = torch.tensor([[[10, 10]]], dtype=torch.float) + utils.draw_keypoints(img, keypoints) + + @pytest.mark.parametrize("colors", ["red", "#FF00FF", (1, 34, 122)]) def test_draw_keypoints_colored(colors): # Keypoints is declared on top as global variable @@ -334,6 +454,93 @@ def test_draw_keypoints_colored(colors): assert_equal(img, img_cp) +@pytest.mark.parametrize("connectivity", [[(0, 1)], [(0, 1), (1, 2)]]) +@pytest.mark.parametrize( + "vis", + [ + torch.tensor([[1, 1, 0], [1, 1, 0]], dtype=torch.bool), + torch.tensor([[1, 1, 0], [1, 1, 0]], dtype=torch.float).unsqueeze_(-1), + ], +) +def test_draw_keypoints_visibility(connectivity, vis): + # Keypoints is declared on top as global variable + keypoints_cp = keypoints.clone() + + img = torch.full((3, 100, 100), 0, dtype=torch.uint8) + img_cp = img.clone() + + vis_cp = vis if vis is None else vis.clone() + + result = utils.draw_keypoints( + image=img, + keypoints=keypoints, + connectivity=connectivity, + colors="red", + visibility=vis, + ) + assert result.size(0) == 3 + assert_equal(keypoints, keypoints_cp) + assert_equal(img, img_cp) + + # compare with a fakedata image + # connect the key points 0 to 1 for both skeletons and do not show the other key points + path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", 
"draw_keypoints_visibility.png" + ) + if not os.path.exists(path): + res = Image.fromarray(result.permute(1, 2, 0).contiguous().numpy()) + res.save(path) + + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + if vis_cp is None: + assert vis is None + else: + assert_equal(vis, vis_cp) + assert vis.dtype == vis_cp.dtype + + +def test_draw_keypoints_visibility_default(): + # Keypoints is declared on top as global variable + keypoints_cp = keypoints.clone() + + img = torch.full((3, 100, 100), 0, dtype=torch.uint8) + img_cp = img.clone() + + result = utils.draw_keypoints( + image=img, + keypoints=keypoints, + connectivity=[(0, 1)], + colors="red", + visibility=None, + ) + assert result.size(0) == 3 + assert_equal(keypoints, keypoints_cp) + assert_equal(img, img_cp) + + # compare against fakedata image, which connects 0->1 for both key-point skeletons + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "fakedata", "draw_keypoint_vanilla.png") + expected = torch.as_tensor(np.array(Image.open(path))).permute(2, 0, 1) + assert_equal(result, expected) + + +def test_draw_keypoints_dtypes(): + image_uint8 = torch.randint(0, 256, size=(3, 100, 100), dtype=torch.uint8) + image_float = to_dtype(image_uint8, torch.float, scale=True) + + out_uint8 = utils.draw_keypoints(image_uint8, keypoints) + out_float = utils.draw_keypoints(image_float, keypoints) + + assert out_uint8.dtype == torch.uint8 + assert out_uint8 is not image_uint8 + + assert out_float.is_floating_point() + assert out_float is not image_float + + torch.testing.assert_close(out_uint8, to_dtype(out_float, torch.uint8, scale=True), rtol=0, atol=1) + + def test_draw_keypoints_errors(): h, w = 10, 10 img = torch.full((3, 100, 100), 0, dtype=torch.uint8) @@ -352,6 +559,18 @@ def test_draw_keypoints_errors(): with pytest.raises(ValueError, match="keypoints must be of shape"): invalid_keypoints = torch.tensor([[10, 10, 10, 10], [5, 6, 7, 8]], dtype=torch.float) utils.draw_keypoints(image=img, keypoints=invalid_keypoints) + with pytest.raises(ValueError, match=re.escape("visibility must be of shape (num_instances, K)")): + one_dim_visibility = torch.tensor([True, True, True], dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=one_dim_visibility) + with pytest.raises(ValueError, match=re.escape("visibility must be of shape (num_instances, K)")): + three_dim_visibility = torch.ones((2, 3, 4), dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=three_dim_visibility) + with pytest.raises(ValueError, match="keypoints and visibility must have the same dimensionality"): + vis_wrong_n = torch.ones((3, 3), dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=vis_wrong_n) + with pytest.raises(ValueError, match="keypoints and visibility must have the same dimensionality"): + vis_wrong_k = torch.ones((2, 4), dtype=torch.bool) + utils.draw_keypoints(image=img, keypoints=keypoints, visibility=vis_wrong_k) @pytest.mark.parametrize("batch", (True, False)) @@ -369,7 +588,7 @@ def test_flow_to_image(batch): assert img.shape == (2, 3, h, w) if batch else (3, h, w) path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "expected_flow.pt") - expected_img = torch.load(path, map_location="cpu") + expected_img = torch.load(path, map_location="cpu", weights_only=True) if batch: expected_img = torch.stack([expected_img, expected_img]) diff --git a/test/test_video_gpu_decoder.py 
b/test/test_video_gpu_decoder.py index d987db6ddeb..aa6d0aee9e0 100644 --- a/test/test_video_gpu_decoder.py +++ b/test/test_video_gpu_decoder.py @@ -3,6 +3,7 @@ import pytest import torch +import torchvision from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader try: @@ -29,8 +30,9 @@ class TestVideoGPUDecoder: ], ) def test_frame_reading(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) with av.open(full_path) as container: for av_frame in container.decode(container.streams.video[0]): av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) @@ -54,7 +56,8 @@ def test_frame_reading(self, video_file): ], ) def test_seek_reading(self, keyframes, full_path, duration): - decoder = VideoReader(full_path, device="cuda") + torchvision.set_video_backend("cuda") + decoder = VideoReader(full_path) time = duration / 2 decoder.seek(time, keyframes_only=keyframes) with av.open(full_path) as container: @@ -79,8 +82,9 @@ def test_seek_reading(self, keyframes, full_path, duration): ], ) def test_metadata(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) video_metadata = decoder.get_metadata()["video"] with av.open(full_path) as container: video = container.streams.video[0] diff --git a/test/test_video_reader.py b/test/test_video_reader.py index 867923d10d0..10995424982 100644 --- a/test/test_video_reader.py +++ b/test/test_video_reader.py @@ -11,7 +11,7 @@ from numpy.random import randint from pytest import approx from torchvision import set_video_backend -from torchvision.io import _HAS_VIDEO_OPT +from torchvision.io import _HAS_CPU_VIDEO_DECODER try: @@ -127,7 +127,7 @@ def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer ascending order. We need to decode more frames even when we meet end pts """ - # seeking in the stream is imprecise. Thus, seek to an ealier PTS by a margin + # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin margin = 1 seek_offset = max(start_pts - margin, 0) @@ -263,7 +263,7 @@ def _get_video_tensor(video_dir, video_file): @pytest.mark.skipif(av is None, reason="PyAV unavailable") -@pytest.mark.skipif(_HAS_VIDEO_OPT is False, reason="Didn't compile with ffmpeg") +@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") class TestVideoReader: def check_separate_decoding_result(self, tv_result, config): """check the decoding results from TorchVision decoder""" diff --git a/test/test_videoapi.py b/test/test_videoapi.py index 895b9b83555..aabcf6407f7 100644 --- a/test/test_videoapi.py +++ b/test/test_videoapi.py @@ -7,7 +7,13 @@ import torchvision from pytest import approx from torchvision.datasets.utils import download_url -from torchvision.io import _HAS_VIDEO_OPT, VideoReader +from torchvision.io import _HAS_CPU_VIDEO_DECODER, VideoReader + + +# WARNING: these tests have been skipped forever on the CI because the video ops +# are never properly available. This is bad, but things have been in a terrible +# state for a long time already as we write this comment, and we'll hopefully be +# able to get rid of this all soon. 
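For context, a minimal sketch (not part of this patch) of the backend-selection pattern these tests are migrated to, assuming the set_video_backend() changes made in torchvision/__init__.py later in this diff; the file path is a placeholder:

    import torchvision
    from torchvision.io import VideoReader

    # Valid backends after this patch: "pyav", "video_reader", "cuda"; selecting a
    # backend that was not compiled in now raises RuntimeError instead of warning.
    torchvision.set_video_backend("video_reader")

    reader = VideoReader("path/to/video.mp4", "video")  # hypothetical path
    for frame in reader:
        data, pts = frame["data"], frame["pts"]  # decoded tensor and its presentation timestamp
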
try: @@ -25,6 +31,13 @@ GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) +def backends(): + backends_ = ["video_reader"] + if av is not None: + backends_.append("pyav") + return backends_ + + def fate(name, path="."): """Download and return a path to a sample from the FFmpeg test suite. See the `FFmpeg Automated Test Environment `_ @@ -49,11 +62,13 @@ def fate(name, path="."): } -@pytest.mark.skipif(_HAS_VIDEO_OPT is False, reason="Didn't compile with ffmpeg") +@pytest.mark.skipif(_HAS_CPU_VIDEO_DECODER is False, reason="Didn't compile with ffmpeg") class TestVideoApi: @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_frame_reading(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) with av.open(full_path) as av_reader: if av_reader.streams.video: @@ -77,6 +92,7 @@ def test_frame_reading(self, test_video): # compare the frames and ptss for i in range(len(vr_frames)): assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) + mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) # on average the difference is very small and caused # by decoding (around 1%) @@ -114,12 +130,62 @@ def test_frame_reading(self, test_video): # we assure that there is never more than 1% difference in signal assert max_delta.item() < 0.001 + @pytest.mark.parametrize("stream", ["video", "audio"]) + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading_mem_vs_file(self, test_video, stream, backend): + torchvision.set_video_backend(backend) + full_path = os.path.join(VIDEO_DIR, test_video) + + reader = VideoReader(full_path) + reader_md = reader.get_metadata() + + if stream in reader_md: + # Test video reading from file vs from memory + vr_frames, vr_frames_mem = [], [] + vr_pts, vr_pts_mem = [], [] + # get vr frames + video_reader = VideoReader(full_path, stream) + for vr_frame in video_reader: + vr_frames.append(vr_frame["data"]) + vr_pts.append(vr_frame["pts"]) + + # get vr frames = read from memory + f = open(full_path, "rb") + fbytes = f.read() + f.close() + video_reader_from_mem = VideoReader(fbytes, stream) + + for vr_frame_from_mem in video_reader_from_mem: + vr_frames_mem.append(vr_frame_from_mem["data"]) + vr_pts_mem.append(vr_frame_from_mem["pts"]) + + # same number of frames + assert len(vr_frames) == len(vr_frames_mem) + assert len(vr_pts) == len(vr_pts_mem) + + # compare the frames and ptss + for i in range(len(vr_frames)): + assert vr_pts[i] == vr_pts_mem[i] + mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) + # on average the difference is very small and caused + # by decoding (around 1%) + # TODO: asses empirically how to set this? 
atm it's 1% + # averaged over all frames + assert mean_delta.item() < 2.55 + + del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem + else: + del reader, reader_md + @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_metadata(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_metadata(self, test_video, config, backend): """ Test that the metadata returned via pyav corresponds to the one returned by the new video decoder API """ + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) reader = VideoReader(full_path, "video") reader_md = reader.get_metadata() @@ -127,7 +193,9 @@ def test_metadata(self, test_video, config): assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5) @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_seek_start(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_seek_start(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) video_reader = VideoReader(full_path, "video") num_frames = 0 @@ -153,7 +221,9 @@ def test_seek_start(self, test_video): assert start_num_frames == num_frames @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_accurateseek_middle(self, test_video): + @pytest.mark.parametrize("backend", ["video_reader"]) + def test_accurateseek_middle(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) stream = "video" video_reader = VideoReader(full_path, stream) @@ -192,7 +262,9 @@ def test_fate_suite(self): @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_keyframe_reading(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_keyframe_reading(self, test_video, config, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) av_reader = av.open(full_path) @@ -227,6 +299,14 @@ def test_keyframe_reading(self, test_video, config): for i in range(len(av_keyframes)): assert av_keyframes[i] == approx(vr_keyframes[i], rel=0.001) + def test_src(self): + with pytest.raises(ValueError, match="src cannot be empty"): + VideoReader(src="") + with pytest.raises(ValueError, match="src must be either string"): + VideoReader(src=2) + with pytest.raises(TypeError, match="unexpected keyword argument"): + VideoReader(path="path") + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/tracing/frcnn/CMakeLists.txt b/test/tracing/frcnn/CMakeLists.txt deleted file mode 100644 index c79382470bd..00000000000 --- a/test/tracing/frcnn/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -project(test_frcnn_tracing) - -find_package(Torch REQUIRED) -find_package(TorchVision REQUIRED) - -# This due to some headers importing Python.h -find_package(Python3 COMPONENTS Development) - -add_executable(test_frcnn_tracing test_frcnn_tracing.cpp) -target_compile_features(test_frcnn_tracing PUBLIC cxx_range_for) -target_link_libraries(test_frcnn_tracing ${TORCH_LIBRARIES} TorchVision::TorchVision Python3::Python) -set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 14) diff --git a/test/tracing/frcnn/test_frcnn_tracing.cpp b/test/tracing/frcnn/test_frcnn_tracing.cpp deleted file mode 100644 index f5f350b6b02..00000000000 --- a/test/tracing/frcnn/test_frcnn_tracing.cpp +++ 
/dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include - - -int main() { - torch::DeviceType device_type; - device_type = torch::kCPU; - - torch::jit::script::Module module; - try { - std::cout << "Loading model\n"; - // Deserialize the ScriptModule from a file using torch::jit::load(). - module = torch::jit::load("fasterrcnn_resnet50_fpn.pt"); - std::cout << "Model loaded\n"; - } catch (const torch::Error& e) { - std::cout << "error loading the model\n"; - return -1; - } catch (const std::exception& e) { - std::cout << "Other error: " << e.what() << "\n"; - return -1; - } - - // TorchScript models require a List[IValue] as input - std::vector inputs; - - // Faster RCNN accepts a List[Tensor] as main input - std::vector images; - images.push_back(torch::rand({3, 256, 275})); - images.push_back(torch::rand({3, 256, 275})); - - inputs.push_back(images); - auto output = module.forward(inputs); - - std::cout << "ok\n"; - std::cout << "output" << output << "\n"; - - if (torch::cuda::is_available()) { - // Move traced model to GPU - module.to(torch::kCUDA); - - // Add GPU inputs - images.clear(); - inputs.clear(); - - torch::TensorOptions options = torch::TensorOptions{torch::kCUDA}; - images.push_back(torch::rand({3, 256, 275}, options)); - images.push_back(torch::rand({3, 256, 275}, options)); - - inputs.push_back(images); - auto output = module.forward(inputs); - - std::cout << "ok\n"; - std::cout << "output" << output << "\n"; - } - return 0; -} diff --git a/test/tracing/frcnn/trace_model.py b/test/tracing/frcnn/trace_model.py deleted file mode 100644 index b5ec50bdab1..00000000000 --- a/test/tracing/frcnn/trace_model.py +++ /dev/null @@ -1,13 +0,0 @@ -import os.path as osp - -import torch -import torchvision - -HERE = osp.dirname(osp.abspath(__file__)) -ASSETS = osp.dirname(osp.dirname(HERE)) - -model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None) -model.eval() - -traced_model = torch.jit.script(model) -traced_model.save("fasterrcnn_resnet50_fpn.pt") diff --git a/torchvision/__init__.py b/torchvision/__init__.py index 739f79407b3..5d06156c25f 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -1,16 +1,20 @@ import os import warnings +from modulefinder import Module import torch -from torchvision import datasets, io, models, ops, transforms, utils -from .extension import _HAS_OPS +# Don't re-order these, we need to load the _C extension (done when importing +# .extensions) before entering _meta_registrations. +from .extension import _HAS_OPS # usort:skip +from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils # usort:skip try: from .version import __version__ # noqa: F401 except ImportError: pass + # Check if torchvision is being imported within the root folder if not _HAS_OPS and os.path.dirname(os.path.realpath(__file__)) == os.path.join( os.path.realpath(os.getcwd()), "torchvision" @@ -66,11 +70,16 @@ def set_video_backend(backend): backend, please compile torchvision from source. """ global _video_backend - if backend not in ["pyav", "video_reader"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend) - if backend == "video_reader" and not io._HAS_VIDEO_OPT: + if backend not in ["pyav", "video_reader", "cuda"]: + raise ValueError("Invalid video backend '%s'. 
Options are 'pyav', 'video_reader' and 'cuda'" % backend) + if backend == "video_reader" and not io._HAS_CPU_VIDEO_DECODER: + # TODO: better messages message = "video_reader video backend is not available. Please compile torchvision from source and try again" - warnings.warn(message) + raise RuntimeError(message) + elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER: + # TODO: better messages + message = "cuda video backend is not available." + raise RuntimeError(message) else: _video_backend = backend @@ -88,3 +97,9 @@ def get_video_backend(): def _is_tracing(): return torch._C._get_tracing_state() + + +def disable_beta_transforms_warning(): + # Noop, only exists to avoid breaking existing code. + # See https://github.com/pytorch/vision/issues/7896 + pass diff --git a/torchvision/_internally_replaced_utils.py b/torchvision/_internally_replaced_utils.py index 18afc3ed93a..e0fa72489f1 100644 --- a/torchvision/_internally_replaced_utils.py +++ b/torchvision/_internally_replaced_utils.py @@ -6,6 +6,7 @@ _HOME = os.path.join(_get_torch_home(), "datasets", "vision") _USE_SHARDED_DATASETS = False +IN_FBCODE = False def _download_file_from_remote_location(fpath: str, url: str) -> None: @@ -28,7 +29,6 @@ def _get_extension_path(lib_name): if os.name == "nt": # Register the main torchvision library location on the default DLL path import ctypes - import sys kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) with_load_library_flags = hasattr(kernel32, "AddDllDirectory") @@ -37,14 +37,7 @@ def _get_extension_path(lib_name): if with_load_library_flags: kernel32.AddDllDirectory.restype = ctypes.c_void_p - if sys.version_info >= (3, 8): - os.add_dll_directory(lib_dir) - elif with_load_library_flags: - res = kernel32.AddDllDirectory(lib_dir) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += f' Error adding "{lib_dir}" to the DLL directories.' 
- raise err + os.add_dll_directory(lib_dir) kernel32.SetErrorMode(prev_error_mode) diff --git a/torchvision/_meta_registrations.py b/torchvision/_meta_registrations.py new file mode 100644 index 00000000000..f75bfb77a7f --- /dev/null +++ b/torchvision/_meta_registrations.py @@ -0,0 +1,225 @@ +import functools + +import torch +import torch._custom_ops +import torch.library + +# Ensure that torch.ops.torchvision is visible +import torchvision.extension # noqa: F401 + + +@functools.lru_cache(None) +def get_meta_lib(): + return torch.library.Library("torchvision", "IMPL", "Meta") + + +def register_meta(op_name, overload_name="default"): + def wrapper(fn): + if torchvision.extension._has_ops(): + get_meta_lib().impl(getattr(getattr(torch.ops.torchvision, op_name), overload_name), fn) + return fn + + return wrapper + + +@register_meta("roi_align") +def meta_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + num_rois = rois.size(0) + channels = input.size(1) + return input.new_empty((num_rois, channels, pooled_height, pooled_width)) + + +@register_meta("_roi_align_backward") +def meta_roi_align_backward( + grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio, aligned +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@register_meta("ps_roi_align") +def meta_ps_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + channels = input.size(1) + torch._check( + channels % (pooled_height * pooled_width) == 0, + "input channels must be a multiple of pooling height * pooling width", + ) + + num_rois = rois.size(0) + out_size = (num_rois, channels // (pooled_height * pooled_width), pooled_height, pooled_width) + return input.new_empty(out_size), torch.empty(out_size, dtype=torch.int32, device="meta") + + +@register_meta("_ps_roi_align_backward") +def meta_ps_roi_align_backward( + grad, + rois, + channel_mapping, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio, + batch_size, + channels, + height, + width, +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@register_meta("roi_pool") +def meta_roi_pool(input, rois, spatial_scale, pooled_height, pooled_width): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + num_rois = rois.size(0) + channels = input.size(1) + out_size = 
(num_rois, channels, pooled_height, pooled_width) + return input.new_empty(out_size), torch.empty(out_size, device="meta", dtype=torch.int32) + + +@register_meta("_roi_pool_backward") +def meta_roi_pool_backward( + grad, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@register_meta("ps_roi_pool") +def meta_ps_roi_pool(input, rois, spatial_scale, pooled_height, pooled_width): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + channels = input.size(1) + torch._check( + channels % (pooled_height * pooled_width) == 0, + "input channels must be a multiple of pooling height * pooling width", + ) + num_rois = rois.size(0) + out_size = (num_rois, channels // (pooled_height * pooled_width), pooled_height, pooled_width) + return input.new_empty(out_size), torch.empty(out_size, device="meta", dtype=torch.int32) + + +@register_meta("_ps_roi_pool_backward") +def meta_ps_roi_pool_backward( + grad, rois, channel_mapping, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) + + +@torch.library.register_fake("torchvision::nms") +def meta_nms(dets, scores, iou_threshold): + torch._check(dets.dim() == 2, lambda: f"boxes should be a 2d tensor, got {dets.dim()}D") + torch._check(dets.size(1) == 4, lambda: f"boxes should have 4 elements in dimension 1, got {dets.size(1)}") + torch._check(scores.dim() == 1, lambda: f"scores should be a 1d tensor, got {scores.dim()}") + torch._check( + dets.size(0) == scores.size(0), + lambda: f"boxes and scores should have same number of elements in dimension 0, got {dets.size(0)} and {scores.size(0)}", + ) + ctx = torch._custom_ops.get_ctx() + num_to_keep = ctx.create_unbacked_symint() + return dets.new_empty(num_to_keep, dtype=torch.long) + + +@register_meta("deform_conv2d") +def meta_deform_conv2d( + input, + weight, + offset, + mask, + bias, + stride_h, + stride_w, + pad_h, + pad_w, + dil_h, + dil_w, + n_weight_grps, + n_offset_grps, + use_mask, +): + + out_height, out_width = offset.shape[-2:] + out_channels = weight.shape[0] + batch_size = input.shape[0] + return input.new_empty((batch_size, out_channels, out_height, out_width)) + + +@register_meta("_deform_conv2d_backward") +def meta_deform_conv2d_backward( + grad, + input, + weight, + offset, + mask, + bias, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + groups, + offset_groups, + use_mask, +): + + grad_input = input.new_empty(input.shape) + grad_weight = weight.new_empty(weight.shape) + grad_offset = offset.new_empty(offset.shape) + grad_mask = mask.new_empty(mask.shape) + grad_bias = bias.new_empty(bias.shape) + return grad_input, grad_weight, grad_offset, grad_mask, grad_bias diff --git a/torchvision/_utils.py b/torchvision/_utils.py index b739ef0966e..aee2676df45 100644 --- 
a/torchvision/_utils.py +++ b/torchvision/_utils.py @@ -1,5 +1,6 @@ import enum -from typing import Sequence, Type, TypeVar +from collections.abc import Sequence +from typing import TypeVar T = TypeVar("T", bound=enum.Enum) @@ -7,7 +8,7 @@ class StrEnumMeta(enum.EnumMeta): auto = enum.auto - def from_str(self: Type[T], member: str) -> T: # type: ignore[misc] + def from_str(self: type[T], member: str) -> T: # type: ignore[misc] try: return self[member] except KeyError: diff --git a/torchvision/csrc/io/decoder/audio_sampler.cpp b/torchvision/csrc/io/decoder/audio_sampler.cpp index e26d788d9c7..648955c5845 100644 --- a/torchvision/csrc/io/decoder/audio_sampler.cpp +++ b/torchvision/csrc/io/decoder/audio_sampler.cpp @@ -48,6 +48,23 @@ bool AudioSampler::init(const SamplerParameters& params) { return false; } +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) + SwrContext* swrContext_ = NULL; + AVChannelLayout channel_out; + AVChannelLayout channel_in; + av_channel_layout_default(&channel_out, params.out.audio.channels); + av_channel_layout_default(&channel_in, params.in.audio.channels); + swr_alloc_set_opts2( + &swrContext_, + &channel_out, + (AVSampleFormat)params.out.audio.format, + params.out.audio.samples, + &channel_in, + (AVSampleFormat)params.in.audio.format, + params.in.audio.samples, + 0, + logCtx_); +#else swrContext_ = swr_alloc_set_opts( nullptr, av_get_default_channel_layout(params.out.audio.channels), @@ -58,6 +75,7 @@ bool AudioSampler::init(const SamplerParameters& params) { params.in.audio.samples, 0, logCtx_); +#endif if (swrContext_ == nullptr) { LOG(ERROR) << "Cannot allocate SwrContext"; return false; diff --git a/torchvision/csrc/io/decoder/audio_stream.cpp b/torchvision/csrc/io/decoder/audio_stream.cpp index 0f6c57e5588..c3a003434b8 100644 --- a/torchvision/csrc/io/decoder/audio_stream.cpp +++ b/torchvision/csrc/io/decoder/audio_stream.cpp @@ -1,31 +1,40 @@ #include "audio_stream.h" #include -#include #include "util.h" namespace ffmpeg { namespace { +static int get_nb_channels(const AVFrame* frame, const AVCodecContext* codec) { +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) + return frame ? frame->ch_layout.nb_channels : codec->ch_layout.nb_channels; +#else + return frame ? 
frame->channels : codec->channels; +#endif +} + bool operator==(const AudioFormat& x, const AVFrame& y) { return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(y.channels) && x.format == y.format; + x.channels == static_cast(get_nb_channels(&y, nullptr)) && + x.format == y.format; } bool operator==(const AudioFormat& x, const AVCodecContext& y) { return x.samples == static_cast(y.sample_rate) && - x.channels == static_cast(y.channels) && x.format == y.sample_fmt; + x.channels == static_cast(get_nb_channels(nullptr, &y)) && + x.format == y.sample_fmt; } AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { x.samples = y.sample_rate; - x.channels = y.channels; + x.channels = get_nb_channels(&y, nullptr); x.format = y.format; return x; } AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { x.samples = y.sample_rate; - x.channels = y.channels; + x.channels = get_nb_channels(nullptr, &y); x.format = y.sample_fmt; return x; } @@ -54,9 +63,15 @@ int AudioStream::initFormat() { if (format_.format.audio.samples == 0) { format_.format.audio.samples = codecCtx_->sample_rate; } +#if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(57, 28, 100) + if (format_.format.audio.channels == 0) { + format_.format.audio.channels = codecCtx_->ch_layout.nb_channels; + } +#else if (format_.format.audio.channels == 0) { format_.format.audio.channels = codecCtx_->channels; } +#endif if (format_.format.audio.format == AV_SAMPLE_FMT_NONE) { format_.format.audio.format = codecCtx_->sample_fmt; } diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index f13e2c3ffcf..cfe762bbc6e 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -285,6 +285,8 @@ bool Decoder::init( return false; } + avioCtx_->max_packet_size = params.maxEncodedBufferSize; + inputCtx_->pb = avioCtx_; inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; } @@ -312,6 +314,8 @@ bool Decoder::init( } } + av_dict_set_int(&options, "probesize", params_.probeSize, 0); + interrupted_ = false; // ffmpeg avformat_open_input call can hang if media source doesn't respond @@ -380,7 +384,30 @@ bool Decoder::init( av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); } + for (unsigned int i = 0; i < inputCtx_->nb_streams; i++) { + if ( +#if LIBAVUTIL_VERSION_MAJOR < 56 // Before FFMPEG 4.0 + inputCtx_->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO +#else // FFMPEG 4.0+ + inputCtx_->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO +#endif + && inputCtx_->streams[i]->duration > 0) { + // There is at least two 1/r_frame_rates from the frame before the last + // one until the video duration, let's prefer to set duration after the + // frame before the last one, but as early as possible + double correction = 2 * inputCtx_->streams[i]->r_frame_rate.den / + (double)inputCtx_->streams[i]->r_frame_rate.num - + 1 / (double)AV_TIME_BASE; + videoDurationMs_ = 1000 * inputCtx_->streams[i]->duration * + inputCtx_->streams[i]->time_base.num / + (double)inputCtx_->streams[i]->time_base.den - + 1000 * correction; + break; + } + } + VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; + VLOG(1) << "Video duration: " << videoDurationMs_; return true; } @@ -418,20 +445,20 @@ bool Decoder::openStreams(std::vector* metadata) { if (it->stream == -2 || // all streams of this type are welcome (!stream && (it->stream == -1 || it->stream == i))) { // new stream VLOG(1) << "Stream type: " << format.type << " found, at index: " << i; - auto stream 
= createStream( + auto stream_2 = createStream( format.type, inputCtx_, i, params_.convertPtsToWallTime, it->format, params_.loggingUuid); - CHECK(stream); - if (stream->openCodec(metadata, params_.numThreads) < 0) { + CHECK(stream_2); + if (stream_2->openCodec(metadata, params_.numThreads) < 0) { LOG(ERROR) << "uuid=" << params_.loggingUuid << " open codec failed, stream_idx=" << i; return false; } - streams_.emplace(i, std::move(stream)); + streams_.emplace(i, std::move(stream_2)); inRange_.set(i, true); } } @@ -588,13 +615,30 @@ int Decoder::getFrame(size_t workingTimeInMs) { result = 0; av_packet_unref(avPacket); + + if (params_.uniformSampling > 1) { + if (doSeek_) { + double duration = + videoDurationMs_ > 0 ? videoDurationMs_ : params_.expectedDuration; + double step = + (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); + avformat_seek_file( + inputCtx_, + -1, + static_cast(step * kFramesDecoded_) + 1, + static_cast(step * (kFramesDecoded_ + 1)), + static_cast(step * (kFramesDecoded_ + 1)), + 0); + ++kFramesDecoded_; + doSeek_ = false; + } + } } av_packet_free(&avPacket); - VLOG(2) << "Interrupted loop" - << ", interrupted_ " << interrupted_ << ", inRange_.any() " - << inRange_.any() << ", decodedFrame " << decodedFrame << ", result " - << result; + VLOG(2) << "Interrupted loop" << ", interrupted_ " << interrupted_ + << ", inRange_.any() " << inRange_.any() << ", decodedFrame " + << decodedFrame << ", result " << result; // loop can be terminated, either by: // 1. explicitly interrupted @@ -658,13 +702,35 @@ int Decoder::processPacket( startCondition = msg.header.pts >= params_.startOffset; } if (endInRange && startCondition) { - *hasMsg = true; - push(std::move(msg)); + *hasMsg = pushMsg(std::move(msg)); } } return result; } +bool Decoder::pushMsg(DecoderOutputMessage&& msg) { + pastDecodedPTS_ = currentDecodedPTS_; + currentDecodedPTS_ = msg.header.pts; + + if (params_.uniformSampling <= 1) { + push(std::move(msg)); + return true; + } + + double duration = + videoDurationMs_ > 0 ? 
videoDurationMs_ : params_.expectedDuration; + double step = + (duration * AV_TIME_BASE) / (1000 * (params_.uniformSampling - 1)); + if (pastDecodedPTS_ < step * kFramesDecoded_ && + step * kFramesDecoded_ <= currentDecodedPTS_) { + push(std::move(msg)); + doSeek_ = true; + return true; + } + + return false; +} + void Decoder::flushStreams() { VLOG(1) << "Flushing streams..."; for (auto& stream : streams_) { @@ -676,7 +742,7 @@ void Decoder::flushStreams() { params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; inRange_.set(stream.second->getIndex(), endInRange); if (endInRange && msg.header.pts >= params_.startOffset) { - push(std::move(msg)); + pushMsg(std::move(msg)); } else { msg.payload.reset(); } diff --git a/torchvision/csrc/io/decoder/decoder.h b/torchvision/csrc/io/decoder/decoder.h index 44d6676aa6b..172a011f93e 100644 --- a/torchvision/csrc/io/decoder/decoder.h +++ b/torchvision/csrc/io/decoder/decoder.h @@ -56,6 +56,7 @@ class Decoder : public MediaDecoder { int* getPrintPrefix() { return &printPrefix; } + double videoDurationMs_ = -1; private: // mark below function for a proper invocation @@ -76,6 +77,8 @@ class Decoder : public MediaDecoder { bool fastSeek = false); void flushStreams(); void cleanUp(); + bool pushMsg(DecoderOutputMessage&& + msg); // returns whether frame is passed to downstream protected: DecoderParameters params_; @@ -89,5 +92,9 @@ class Decoder : public MediaDecoder { AVIOContext* avioCtx_{nullptr}; std::unordered_map> streams_; std::bitset<64> inRange_; + int kFramesDecoded_{0}; + int64_t pastDecodedPTS_{-1}; + int64_t currentDecodedPTS_{-1}; + bool doSeek_{false}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h index dac6293d366..d2dc5c7935b 100644 --- a/torchvision/csrc/io/decoder/defs.h +++ b/torchvision/csrc/io/decoder/defs.h @@ -165,7 +165,7 @@ struct MediaFormat { struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize - std::string uri; + std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC @@ -213,6 +213,23 @@ struct DecoderParameters { // Skip packets that fail with EPERM errors and continue decoding. bool skipOperationNotPermittedPackets{false}; + + // probing size in bytes, i.e. the size of the data to analyze to get stream + // information. A higher value will enable detecting more information in case + // it is dispersed into the stream, but will increase latency. Must be an + // integer not lesser than 32. It is 5000000 by default. + int64_t probeSize{5000000}; + + // Expected duration of the video to be decoded, mainly used with uniform + // sampling + float expectedDuration{0.0f}; + + // Sample N key-frames from the video roughly uniformly across the timeline + int uniformSampling{0}; + + // with 0, ffmpeg allocates buffers of size 32768 bytes for encoded frames. + // Override this with bigger buffer size if needed. + int64_t maxEncodedBufferSize{0}; }; struct DecoderHeader { @@ -295,7 +312,7 @@ struct DecoderMetadata { }; /** * Abstract class for decoding media bytes - * It has two diffrent modes. Internal media bytes retrieval for given uri and + * It has two different modes. 
Internal media bytes retrieval for given uri and * external media bytes provider in case of memory streams */ class MediaDecoder { diff --git a/torchvision/csrc/io/decoder/gpu/README.rst b/torchvision/csrc/io/decoder/gpu/README.rst index cebd31cb557..e4573d7fe75 100644 --- a/torchvision/csrc/io/decoder/gpu/README.rst +++ b/torchvision/csrc/io/decoder/gpu/README.rst @@ -18,4 +18,4 @@ GPU decoder depends on ffmpeg for demuxing, uses NVDECODE APIs from the nvidia-v .. code:: bash - python setup.py install + pip install . -v --no-build-isolation diff --git a/torchvision/csrc/io/decoder/gpu/decoder.cpp b/torchvision/csrc/io/decoder/gpu/decoder.cpp index 0e451298825..f7377ede38b 100644 --- a/torchvision/csrc/io/decoder/gpu/decoder.cpp +++ b/torchvision/csrc/io/decoder/gpu/decoder.cpp @@ -59,7 +59,7 @@ void Decoder::release() { if (decoder) { cuvidDestroyDecoder(decoder); } - cuCtxPopCurrent(NULL); + cuCtxPopCurrent(nullptr); } /* Trigger video decoding. @@ -100,7 +100,7 @@ int Decoder::handle_picture_decode(CUVIDPICPARAMS* pic_params) { check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); check_for_cuda_errors( cuvidDecodePicture(decoder, pic_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); return 1; } @@ -143,7 +143,8 @@ int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) { uint8_t* frame_ptr = decoded_frame.data_ptr(); const uint8_t* const source_arr[] = { (const uint8_t* const)source_frame, - (const uint8_t* const)(source_frame + source_pitch * ((surface_height + 1) & ~1))}; + (const uint8_t* const)(source_frame + + source_pitch * ((surface_height + 1) & ~1))}; auto err = nppiNV12ToRGB_709CSC_8u_P2C3R( source_arr, @@ -159,7 +160,7 @@ int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) { check_for_cuda_errors(cuStreamSynchronize(cuvidStream), __LINE__, __FILE__); decoded_frames.push(decoded_frame); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); check_for_cuda_errors( cuvidUnmapVideoFrame(decoder, source_frame), __LINE__, __FILE__); @@ -177,7 +178,7 @@ void Decoder::query_hardware(CUVIDEOFORMAT* video_format) { check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); check_for_cuda_errors(cuvidGetDecoderCaps(&decode_caps), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); if (!decode_caps.bIsSupported) { TORCH_CHECK(false, "Codec not supported on this GPU"); @@ -319,7 +320,7 @@ int Decoder::handle_video_sequence(CUVIDEOFORMAT* video_format) { cuvidCreateDecoder(&decoder, &video_decode_create_info), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); return decode_surface; } @@ -389,7 +390,7 @@ int Decoder::reconfigure_decoder(CUVIDEOFORMAT* video_format) { check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__); check_for_cuda_errors( cuvidReconfigureDecoder(decoder, &reconfig_params), __LINE__, __FILE__); - check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__); + check_for_cuda_errors(cuCtxPopCurrent(nullptr), __LINE__, __FILE__); return decode_surface; } diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp index 
a7b0128e3ed..4e420c3b3cd 100644 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ b/torchvision/csrc/io/decoder/memory_buffer.cpp @@ -61,7 +61,7 @@ DecoderInCallback MemoryBuffer::getCallback( } // seek mode if (!timeoutMs) { - // seek capabilty, yes - supported + // seek capability, yes - supported return 0; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/stream.cpp b/torchvision/csrc/io/decoder/stream.cpp index 0d625ef211c..7969741e72c 100644 --- a/torchvision/csrc/io/decoder/stream.cpp +++ b/torchvision/csrc/io/decoder/stream.cpp @@ -1,6 +1,5 @@ #include "stream.h" #include -#include #include #include "util.h" @@ -63,15 +62,8 @@ int Stream::openCodec(std::vector* metadata, int num_threads) { codecCtx_->thread_count = num_threads; } else { // otherwise set sensible defaults - // with the special case for the different MPEG4 codecs - // that don't have threading context functions - if (codecCtx_->codec->capabilities & AV_CODEC_CAP_INTRA_ONLY) { - codecCtx_->thread_type = FF_THREAD_FRAME; - codecCtx_->thread_count = 2; - } else { - codecCtx_->thread_count = 8; - codecCtx_->thread_type = FF_THREAD_SLICE; - } + codecCtx_->thread_count = 8; + codecCtx_->thread_type = FF_THREAD_SLICE; } int ret; diff --git a/torchvision/csrc/io/decoder/subtitle_stream.cpp b/torchvision/csrc/io/decoder/subtitle_stream.cpp index 27c61d4dbd9..3416f702d7e 100644 --- a/torchvision/csrc/io/decoder/subtitle_stream.cpp +++ b/torchvision/csrc/io/decoder/subtitle_stream.cpp @@ -1,6 +1,5 @@ #include "subtitle_stream.h" #include -#include #include "util.h" namespace ffmpeg { diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp index 936d1e94f46..085966ce687 100644 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ b/torchvision/csrc/io/decoder/sync_decoder_test.cpp @@ -158,8 +158,7 @@ void runDecoder(SyncDecoder& decoder) { AVSubtitle sub; memset(&sub, 0, sizeof(sub)); EXPECT_TRUE(Util::deserialize(*out.payload, &sub)); - LOG(INFO) << "Found subtitles" - << ", num rects: " << sub.num_rects; + LOG(INFO) << "Found subtitles" << ", num rects: " << sub.num_rects; for (int i = 0; i < sub.num_rects; ++i) { std::string text = "picture"; if (sub.rects[i]->type == SUBTITLE_TEXT) { @@ -210,9 +209,9 @@ TEST(SyncDecoder, TestSyncDecoderPerformance) { auto new8x8 = measurePerformanceUs(stats, kRounds, 8, 8); auto new16x8 = measurePerformanceUs(stats, kRounds, 16, 8); auto new32x4 = measurePerformanceUs(stats, kRounds, 32, 4); - LOG(INFO) << "Clip decoding (us)" - << ", new(4x2): " << new4x2 << ", new(8x8): " << new8x8 - << ", new(16x8): " << new16x8 << ", new(32x4): " << new32x4; + LOG(INFO) << "Clip decoding (us)" << ", new(4x2): " << new4x2 + << ", new(8x8): " << new8x8 << ", new(16x8): " << new16x8 + << ", new(32x4): " << new32x4; } TEST(SyncDecoder, Test) { @@ -368,7 +367,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); @@ -408,7 +407,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/util.cpp b/torchvision/csrc/io/decoder/util.cpp index 2ecd7512c06..7198d2174ed 100644 --- a/torchvision/csrc/io/decoder/util.cpp +++ b/torchvision/csrc/io/decoder/util.cpp @@ -265,7 +265,6 @@ 
std::string generateErrorDesc(int errorCode) { size_t serialize(const AVSubtitle& sub, ByteStorage* out) { const auto len = size(sub); - TORCH_CHECK_LE(len, out->tail()); size_t pos = 0; if (!Serializer::serializeItem(out->writableTail(), len, pos, sub)) { return 0; diff --git a/torchvision/csrc/io/decoder/util_test.cpp b/torchvision/csrc/io/decoder/util_test.cpp index 78de08b7139..0a093d9561b 100644 --- a/torchvision/csrc/io/decoder/util_test.cpp +++ b/torchvision/csrc/io/decoder/util_test.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "util.h" diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp index 62ec0709be1..8b712609e34 100644 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ b/torchvision/csrc/io/decoder/video_sampler.cpp @@ -181,6 +181,23 @@ bool VideoSampler::init(const SamplerParameters& params) { // set output format params_ = params; + if (params.in.video.format == AV_PIX_FMT_YUV420P) { + /* When the video width and height are not multiples of 8, + * and there is no size change in the conversion, + * a blurry screen will appear on the right side + * This problem was discovered in 2012 and + * continues to exist in version 4.1.3 in 2019 + * This problem can be avoided by increasing SWS_ACCURATE_RND + * details https://trac.ffmpeg.org/ticket/1582 + */ + if ((params.in.video.width & 0x7) || (params.in.video.height & 0x7)) { + VLOG(1) << "The width " << params.in.video.width << " and height " + << params.in.video.height << " the image is not a multiple of 8, " + << "the decoding speed may be reduced"; + swsFlags_ |= SWS_ACCURATE_RND; + } + } + scaleContext_ = sws_getContext( params.in.video.width, params.in.video.height, diff --git a/torchvision/csrc/io/image/common.cpp b/torchvision/csrc/io/image/common.cpp new file mode 100644 index 00000000000..7743961a09d --- /dev/null +++ b/torchvision/csrc/io/image/common.cpp @@ -0,0 +1,50 @@ + +#include "common.h" + +// If we are in a Windows environment, we need to define +// initialization functions for the _custom_ops extension +#ifdef _WIN32 +void* PyInit_image(void) { + return nullptr; +} +#endif + +namespace vision { +namespace image { + +void validate_encoded_data(const torch::Tensor& encoded_data) { + TORCH_CHECK(encoded_data.is_contiguous(), "Input tensor must be contiguous."); + TORCH_CHECK( + encoded_data.dtype() == torch::kU8, + "Input tensor must have uint8 data type, got ", + encoded_data.dtype()); + TORCH_CHECK( + encoded_data.dim() == 1 && encoded_data.numel() > 0, + "Input tensor must be 1-dimensional and non-empty, got ", + encoded_data.dim(), + " dims and ", + encoded_data.numel(), + " numels."); +} + +bool should_this_return_rgb_or_rgba_let_me_know_in_the_comments_down_below_guys_see_you_in_the_next_video( + ImageReadMode mode, + bool has_alpha) { + // Return true if the calling decoding function should return a 3D RGB tensor, + // and false if it should return a 4D RGBA tensor. + // This function ignores the requested "grayscale" modes and treats it as + // "unchanged", so it should only used on decoders who don't support grayscale + // outputs. + + if (mode == IMAGE_READ_MODE_RGB) { + return true; + } + if (mode == IMAGE_READ_MODE_RGB_ALPHA) { + return false; + } + // From here we assume mode is "unchanged", even for grayscale ones. 
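+  //
+  // To make the contract concrete, the effective mapping is (a summary of the
+  // branches in this helper, not additional behavior):
+  //
+  //   requested mode              has_alpha   returns
+  //   IMAGE_READ_MODE_RGB         any         true  (RGB)
+  //   IMAGE_READ_MODE_RGB_ALPHA   any         false (RGBA)
+  //   any other mode              false       true  (RGB)
+  //   any other mode              true        false (RGBA)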
+ return !has_alpha; +} + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/image_read_mode.h b/torchvision/csrc/io/image/common.h similarity index 65% rename from torchvision/csrc/io/image/image_read_mode.h rename to torchvision/csrc/io/image/common.h index 84425265c34..d81acfda7d4 100644 --- a/torchvision/csrc/io/image/image_read_mode.h +++ b/torchvision/csrc/io/image/common.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace vision { namespace image { @@ -13,5 +14,11 @@ const ImageReadMode IMAGE_READ_MODE_GRAY_ALPHA = 2; const ImageReadMode IMAGE_READ_MODE_RGB = 3; const ImageReadMode IMAGE_READ_MODE_RGB_ALPHA = 4; +void validate_encoded_data(const torch::Tensor& encoded_data); + +bool should_this_return_rgb_or_rgba_let_me_know_in_the_comments_down_below_guys_see_you_in_the_next_video( + ImageReadMode mode, + bool has_alpha); + } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_gif.cpp b/torchvision/csrc/io/image/cpu/decode_gif.cpp new file mode 100644 index 00000000000..f26d37950e3 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_gif.cpp @@ -0,0 +1,165 @@ +#include "decode_gif.h" +#include +#include "../common.h" +#include "giflib/gif_lib.h" + +namespace vision { +namespace image { + +typedef struct reader_helper_t { + uint8_t const* encoded_data; // input tensor data pointer + size_t encoded_data_size; // size of input tensor in bytes + size_t num_bytes_read; // number of bytes read so far in the tensor +} reader_helper_t; + +// That function is used by GIFLIB routines to read the encoded bytes. +// This reads `len` bytes and writes them into `buf`. The data is read from the +// input tensor passed to decode_gif() starting at the `num_bytes_read` +// position. +int read_from_tensor(GifFileType* gifFile, GifByteType* buf, int len) { + // the UserData field was set in DGifOpen() + reader_helper_t* reader_helper = + static_cast(gifFile->UserData); + + size_t num_bytes_to_read = std::min( + (size_t)len, + reader_helper->encoded_data_size - reader_helper->num_bytes_read); + std::memcpy( + buf, reader_helper->encoded_data + reader_helper->num_bytes_read, len); + reader_helper->num_bytes_read += num_bytes_to_read; + return num_bytes_to_read; +} + +torch::Tensor decode_gif(const torch::Tensor& encoded_data) { + // LibGif docs: https://giflib.sourceforge.net/intro.html + // Refer over there for more details on the libgif API, API ref, and a + // detailed description of the GIF format. + + validate_encoded_data(encoded_data); + + int error = D_GIF_SUCCEEDED; + + // We're using DGidOpen. The other entrypoints of libgif are + // DGifOpenFileName and DGifOpenFileHandle but we don't want to use those, + // since we need to read the encoded bytes from a tensor of encoded bytes, not + // from a file (for consistency with existing jpeg and png decoders). Using + // DGifOpen is the only way to read from a custom source. + // For that we need to provide a reader function `read_from_tensor` that + // reads from the tensor, and we have to keep track of the number of bytes + // read so far: this is why we need the reader_helper struct. + + // TODO: We are potentially doing an unnecessary copy of the encoded bytes: + // - 1 copy in from file to tensor (in read_file()) + // - 1 copy from tensor to GIFLIB buffers (in read_from_tensor()) + // Since we're vendoring GIFLIB we can potentially modify the calls to + // InternalRead() and just set the `buf` pointer to the tensor data directly. 
+ // That might even save allocation of those buffers. + // If we do that, we'd have to make sure the buffers are never written to by + // GIFLIB, otherwise we'd be overridding the tensor data. + reader_helper_t reader_helper; + reader_helper.encoded_data = encoded_data.data_ptr(); + reader_helper.encoded_data_size = encoded_data.numel(); + reader_helper.num_bytes_read = 0; + GifFileType* gifFile = + DGifOpen(static_cast(&reader_helper), read_from_tensor, &error); + + TORCH_CHECK( + (gifFile != nullptr) && (error == D_GIF_SUCCEEDED), + "DGifOpenFileName() failed - ", + error); + + if (DGifSlurp(gifFile) == GIF_ERROR) { + auto gifFileError = gifFile->Error; + DGifCloseFile(gifFile, &error); + TORCH_CHECK(false, "DGifSlurp() failed - ", gifFileError); + } + auto num_images = gifFile->ImageCount; + + // This check should already done within DGifSlurp(), just to be safe + TORCH_CHECK(num_images > 0, "GIF file should contain at least one image!"); + + GifColorType bg = {0, 0, 0}; + if (gifFile->SColorMap) { + bg = gifFile->SColorMap->Colors[gifFile->SBackGroundColor]; + } + + // The GIFLIB docs say that the canvas's height and width are potentially + // ignored by modern viewers, so to be on the safe side we set the output + // height to max(canvas_heigh, first_image_height). Same for width. + // https://giflib.sourceforge.net/whatsinagif/bits_and_bytes.html + auto out_h = + std::max(gifFile->SHeight, gifFile->SavedImages[0].ImageDesc.Height); + auto out_w = + std::max(gifFile->SWidth, gifFile->SavedImages[0].ImageDesc.Width); + + // We output a channels-last tensor for consistency with other image decoders. + // Torchvision's resize tends to be is faster on uint8 channels-last tensors. + auto options = torch::TensorOptions() + .dtype(torch::kU8) + .memory_format(torch::MemoryFormat::ChannelsLast); + auto out = torch::empty( + {int64_t(num_images), 3, int64_t(out_h), int64_t(out_w)}, options); + auto out_a = out.accessor(); + for (int i = 0; i < num_images; i++) { + const SavedImage& img = gifFile->SavedImages[i]; + + GraphicsControlBlock gcb; + DGifSavedExtensionToGCB(gifFile, i, &gcb); + + const GifImageDesc& desc = img.ImageDesc; + const ColorMapObject* cmap = + desc.ColorMap ? desc.ColorMap : gifFile->SColorMap; + TORCH_CHECK( + cmap != nullptr, + "Global and local color maps are missing. This should never happen!"); + + // When going from one image to another, there is a "disposal method" which + // specifies how to handle the transition. E.g. DISPOSE_DO_NOT means that + // the current image should essentially be drawn on top of the previous + // canvas. The pixels of that previous canvas will appear on the new one if + // either: + // - a pixel is transparent in the current image + // - the current image is smaller than the canvas, hence exposing its pixels + // The "background" disposal method means that the current canvas should be + // set to the background color. + // We only support these 2 modes and default to "background" when the + // disposal method is unspecified, or when it's set to "DISPOSE_PREVIOUS" + // which according to GIFLIB is not widely supported. + // (https://giflib.sourceforge.net/whatsinagif/animation_and_transparency.html). + if (i > 0 && gcb.DisposalMode == DISPOSE_DO_NOT) { + out[i] = out[i - 1]; + } else { + // Background. 
If bg wasn't defined, it will be (0, 0, 0) + for (int h = 0; h < gifFile->SHeight; h++) { + for (int w = 0; w < gifFile->SWidth; w++) { + out_a[i][0][h][w] = bg.Red; + out_a[i][1][h][w] = bg.Green; + out_a[i][2][h][w] = bg.Blue; + } + } + } + + for (int h = 0; h < desc.Height; h++) { + for (int w = 0; w < desc.Width; w++) { + auto c = img.RasterBits[h * desc.Width + w]; + if (c == gcb.TransparentColor) { + continue; + } + GifColorType rgb = cmap->Colors[c]; + out_a[i][0][h + desc.Top][w + desc.Left] = rgb.Red; + out_a[i][1][h + desc.Top][w + desc.Left] = rgb.Green; + out_a[i][2][h + desc.Top][w + desc.Left] = rgb.Blue; + } + } + } + + out = out.squeeze(0); // remove batch dim if there's only one image + + DGifCloseFile(gifFile, &error); + TORCH_CHECK(error == D_GIF_SUCCEEDED, "DGifCloseFile() failed - ", error); + + return out; +} + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_gif.h b/torchvision/csrc/io/image/cpu/decode_gif.h new file mode 100644 index 00000000000..68d5073c91b --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_gif.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace vision { +namespace image { + +// encoded_data tensor must be 1D uint8 and contiguous +C10_EXPORT torch::Tensor decode_gif(const torch::Tensor& encoded_data); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 1cc05dc76ca..43a688604f6 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -1,12 +1,19 @@ #include "decode_image.h" +#include "decode_gif.h" #include "decode_jpeg.h" #include "decode_png.h" +#include "decode_webp.h" namespace vision { namespace image { -torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) { +torch::Tensor decode_image( + const torch::Tensor& data, + ImageReadMode mode, + bool apply_exif_orientation) { + // Check that tensor is a CPU tensor + TORCH_CHECK(data.device() == torch::kCPU, "Expected a CPU tensor"); // Check that the input tensor dtype is uint8 TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); // Check that the input tensor is 1-dimensional @@ -14,21 +21,43 @@ torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) { data.dim() == 1 && data.numel() > 0, "Expected a non empty 1-dimensional tensor"); + auto err_msg = + "Unsupported image file. Only jpeg, png, webp and gif are currently supported. For avif and heic format, please rely on `decode_avif` and `decode_heic` directly."; + auto datap = data.data_ptr(); const uint8_t jpeg_signature[3] = {255, 216, 255}; // == "\xFF\xD8\xFF" + TORCH_CHECK(data.numel() >= 3, err_msg); + if (memcmp(jpeg_signature, datap, 3) == 0) { + return decode_jpeg(data, mode, apply_exif_orientation); + } + const uint8_t png_signature[4] = {137, 80, 78, 71}; // == "\211PNG" + TORCH_CHECK(data.numel() >= 4, err_msg); + if (memcmp(png_signature, datap, 4) == 0) { + return decode_png(data, mode, apply_exif_orientation); + } - if (memcmp(jpeg_signature, datap, 3) == 0) { - return decode_jpeg(data, mode); - } else if (memcmp(png_signature, datap, 4) == 0) { - return decode_png(data, mode); - } else { - TORCH_CHECK( - false, - "Unsupported image file. 
Only jpeg and png ", - "are currently supported."); + const uint8_t gif_signature_1[6] = { + 0x47, 0x49, 0x46, 0x38, 0x39, 0x61}; // == "GIF89a" + const uint8_t gif_signature_2[6] = { + 0x47, 0x49, 0x46, 0x38, 0x37, 0x61}; // == "GIF87a" + TORCH_CHECK(data.numel() >= 6, err_msg); + if (memcmp(gif_signature_1, datap, 6) == 0 || + memcmp(gif_signature_2, datap, 6) == 0) { + return decode_gif(data); + } + + const uint8_t webp_signature_begin[4] = {0x52, 0x49, 0x46, 0x46}; // == "RIFF" + const uint8_t webp_signature_end[7] = { + 0x57, 0x45, 0x42, 0x50, 0x56, 0x50, 0x38}; // == "WEBPVP8" + TORCH_CHECK(data.numel() >= 15, err_msg); + if ((memcmp(webp_signature_begin, datap, 4) == 0) && + (memcmp(webp_signature_end, datap + 8, 7) == 0)) { + return decode_webp(data, mode); } + + TORCH_CHECK(false, err_msg); } } // namespace image diff --git a/torchvision/csrc/io/image/cpu/decode_image.h b/torchvision/csrc/io/image/cpu/decode_image.h index 853d6d91afa..f66d47eccd4 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.h +++ b/torchvision/csrc/io/image/cpu/decode_image.h @@ -1,14 +1,15 @@ #pragma once #include -#include "../image_read_mode.h" +#include "../common.h" namespace vision { namespace image { C10_EXPORT torch::Tensor decode_image( const torch::Tensor& data, - ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED, + bool apply_exif_orientation = false); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp index 6ec644d003e..052b98e1be9 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp @@ -1,17 +1,23 @@ #include "decode_jpeg.h" +#include "../common.h" #include "common_jpeg.h" +#include "exif.h" namespace vision { namespace image { #if !JPEG_FOUND -torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { +torch::Tensor decode_jpeg( + const torch::Tensor& data, + ImageReadMode mode, + bool apply_exif_orientation) { TORCH_CHECK( false, "decode_jpeg: torchvision not compiled with libjpeg support"); } #else using namespace detail; +using namespace exif_private; namespace { @@ -65,19 +71,72 @@ static void torch_jpeg_set_source_mgr( src->len = len; src->pub.bytes_in_buffer = len; src->pub.next_input_byte = src->data; + + jpeg_save_markers(cinfo, APP1, 0xffff); +} + +inline unsigned char clamped_cmyk_rgb_convert( + unsigned char k, + unsigned char cmy) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L568-L569 + int v = k * cmy + 128; + v = ((v >> 8) + v) >> 8; + return std::clamp(k - v, 0, 255); +} + +void convert_line_cmyk_to_rgb( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* rgb_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + rgb_line[i * 3 + 0] = clamped_cmyk_rgb_convert(k, 255 - c); + rgb_line[i * 3 + 1] = clamped_cmyk_rgb_convert(k, 255 - m); + rgb_line[i * 3 + 2] = clamped_cmyk_rgb_convert(k, 255 - y); + } +} + +inline unsigned char rgb_to_gray(int r, int g, int b) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L226 + return (r * 19595 + g * 38470 + b * 7471 + 0x8000) >> 16; +} + +void 
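+// Worked example of the fixed-point arithmetic above (a restatement of
+// clamped_cmyk_rgb_convert and rgb_to_gray, not new behavior):
+// - clamped_cmyk_rgb_convert(k, cmy) computes roughly k - k * cmy / 255,
+//   i.e. k * (255 - cmy) / 255; the ((v >> 8) + v) >> 8 step is an integer
+//   approximation of dividing by 255, with the +128 providing rounding.
+//   E.g. k = 200, cmy = 128: v = 25728, which reduces to 100, so the helper
+//   returns 200 - 100 = 100.
+// - rgb_to_gray uses 16-bit fixed-point luma weights 19595/65536 ~= 0.299,
+//   38470/65536 ~= 0.587 and 7471/65536 ~= 0.114 (the usual BT.601 weights).
+// convert_line_cmyk_to_gray below simply chains the two steps per pixel.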
convert_line_cmyk_to_gray( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* gray_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + int r = clamped_cmyk_rgb_convert(k, 255 - c); + int g = clamped_cmyk_rgb_convert(k, 255 - m); + int b = clamped_cmyk_rgb_convert(k, 255 - y); + + gray_line[i] = rgb_to_gray(r, g, b); + } } } // namespace -torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { +torch::Tensor decode_jpeg( + const torch::Tensor& data, + ImageReadMode mode, + bool apply_exif_orientation) { C10_LOG_API_USAGE_ONCE( "torchvision.csrc.io.image.cpu.decode_jpeg.decode_jpeg"); - // Check that the input tensor dtype is uint8 - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - // Check that the input tensor is 1-dimensional - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); + + validate_encoded_data(data); struct jpeg_decompress_struct cinfo; struct torch_jpeg_error_mgr jerr; @@ -102,20 +161,29 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_read_header(&cinfo, TRUE); int channels = cinfo.num_components; + bool cmyk_to_rgb_or_gray = false; if (mode != IMAGE_READ_MODE_UNCHANGED) { switch (mode) { case IMAGE_READ_MODE_GRAY: - if (cinfo.jpeg_color_space != JCS_GRAYSCALE) { + if (cinfo.jpeg_color_space == JCS_CMYK || + cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_GRAYSCALE; - channels = 1; } + channels = 1; break; case IMAGE_READ_MODE_RGB: - if (cinfo.jpeg_color_space != JCS_RGB) { + if (cinfo.jpeg_color_space == JCS_CMYK || + cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_RGB; - channels = 3; } + channels = 3; break; /* * Libjpeg does not support converting from CMYK to grayscale etc. There @@ -130,6 +198,11 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_calc_output_dimensions(&cinfo); } + int exif_orientation = -1; + if (apply_exif_orientation) { + exif_orientation = fetch_jpeg_exif_orientation(&cinfo); + } + jpeg_start_decompress(&cinfo); int height = cinfo.output_height; @@ -139,21 +212,57 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { auto tensor = torch::empty({int64_t(height), int64_t(width), channels}, torch::kU8); auto ptr = tensor.data_ptr(); + torch::Tensor cmyk_line_tensor; + if (cmyk_to_rgb_or_gray) { + cmyk_line_tensor = torch::empty({int64_t(width), 4}, torch::kU8); + } + while (cinfo.output_scanline < cinfo.output_height) { /* jpeg_read_scanlines expects an array of pointers to scanlines. * Here the array is only one element long, but you could ask for * more than one scanline at a time if that's more convenient. 
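   * When cmyk_to_rgb_or_gray is set, each scanline is instead read into
   * cmyk_line_tensor as 4-byte CMYK pixels and then converted into the RGB or
   * gray output row by the helpers above; either way `ptr` advances by
   * `stride` per iteration.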
*/ - jpeg_read_scanlines(&cinfo, &ptr, 1); + if (cmyk_to_rgb_or_gray) { + auto cmyk_line_ptr = cmyk_line_tensor.data_ptr(); + jpeg_read_scanlines(&cinfo, &cmyk_line_ptr, 1); + + if (channels == 3) { + convert_line_cmyk_to_rgb(&cinfo, cmyk_line_ptr, ptr); + } else if (channels == 1) { + convert_line_cmyk_to_gray(&cinfo, cmyk_line_ptr, ptr); + } + } else { + jpeg_read_scanlines(&cinfo, &ptr, 1); + } ptr += stride; } jpeg_finish_decompress(&cinfo); jpeg_destroy_decompress(&cinfo); - return tensor.permute({2, 0, 1}); + auto output = tensor.permute({2, 0, 1}); + + if (apply_exif_orientation) { + return exif_orientation_transform(output, exif_orientation); + } + return output; +} +#endif // #if !JPEG_FOUND + +int64_t _jpeg_version() { +#if JPEG_FOUND + return JPEG_LIB_VERSION; +#else + return -1; +#endif } +bool _is_compiled_against_turbo() { +#ifdef LIBJPEG_TURBO_VERSION + return true; +#else + return false; #endif +} } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.h b/torchvision/csrc/io/image/cpu/decode_jpeg.h index 97ed3d51a54..7412a46d2ea 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.h +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.h @@ -1,14 +1,18 @@ #pragma once #include -#include "../image_read_mode.h" +#include "../common.h" namespace vision { namespace image { C10_EXPORT torch::Tensor decode_jpeg( const torch::Tensor& data, - ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED, + bool apply_exif_orientation = false); + +C10_EXPORT int64_t _jpeg_version(); +C10_EXPORT bool _is_compiled_against_turbo(); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_png.cpp b/torchvision/csrc/io/image/cpu/decode_png.cpp index b1ceaf1badd..5ea6f073975 100644 --- a/torchvision/csrc/io/image/cpu/decode_png.cpp +++ b/torchvision/csrc/io/image/cpu/decode_png.cpp @@ -1,14 +1,18 @@ #include "decode_png.h" +#include "../common.h" #include "common_png.h" +#include "exif.h" namespace vision { namespace image { +using namespace exif_private; + #if !PNG_FOUND torch::Tensor decode_png( const torch::Tensor& data, ImageReadMode mode, - bool allow_16_bits) { + bool apply_exif_orientation) { TORCH_CHECK( false, "decode_png: torchvision not compiled with libPNG support"); } @@ -22,14 +26,10 @@ bool is_little_endian() { torch::Tensor decode_png( const torch::Tensor& data, ImageReadMode mode, - bool allow_16_bits) { + bool apply_exif_orientation) { C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.image.cpu.decode_png.decode_png"); - // Check that the input tensor dtype is uint8 - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - // Check that the input tensor is 1-dimensional - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); + + validate_encoded_data(data); auto png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr); @@ -49,6 +49,7 @@ torch::Tensor decode_png( png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); TORCH_CHECK(false, "Internal error."); } + TORCH_CHECK(datap_len >= 8, "Content is too small for png!") auto is_png = !png_sig_cmp(datap, 0, 8); TORCH_CHECK(is_png, "Content is not png!") @@ -93,18 +94,19 @@ torch::Tensor decode_png( TORCH_CHECK(retval == 1, "Could read image metadata from content.") } - auto max_bit_depth = allow_16_bits ? 
16 : 8; - auto err_msg = "At most " + std::to_string(max_bit_depth) + - "-bit PNG images are supported currently."; - if (bit_depth > max_bit_depth) { + if (bit_depth > 8 && bit_depth != 16) { png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); - TORCH_CHECK(false, err_msg) + TORCH_CHECK( + false, + "bit depth of png image is " + std::to_string(bit_depth) + + ". Only <=8 and 16 are supported.") } int channels = png_get_channels(png_ptr, info_ptr); - if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) + if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) { png_set_expand_gray_1_2_4_to_8(png_ptr); + } int number_of_passes; if (interlace_type == PNG_INTERLACE_ADAM7) { @@ -193,48 +195,34 @@ torch::Tensor decode_png( } auto num_pixels_per_row = width * channels; + auto is_16_bits = bit_depth == 16; auto tensor = torch::empty( {int64_t(height), int64_t(width), channels}, - bit_depth <= 8 ? torch::kU8 : torch::kI32); - - if (bit_depth <= 8) { - auto t_ptr = tensor.accessor().data(); - for (int pass = 0; pass < number_of_passes; pass++) { - for (png_uint_32 i = 0; i < height; ++i) { - png_read_row(png_ptr, t_ptr, nullptr); - t_ptr += num_pixels_per_row; - } - t_ptr = tensor.accessor().data(); - } - } else { - // We're reading a 16bits png, but pytorch doesn't support uint16. - // So we read each row in a 16bits tmp_buffer which we then cast into - // a int32 tensor instead. - if (is_little_endian()) { - png_set_swap(png_ptr); - } - int32_t* t_ptr = tensor.accessor().data(); - - // We create a tensor instead of malloc-ing for automatic memory management - auto tmp_buffer_tensor = torch::empty( - {int64_t(num_pixels_per_row * sizeof(uint16_t))}, torch::kU8); - uint16_t* tmp_buffer = - (uint16_t*)tmp_buffer_tensor.accessor().data(); - - for (int pass = 0; pass < number_of_passes; pass++) { - for (png_uint_32 i = 0; i < height; ++i) { - png_read_row(png_ptr, (uint8_t*)tmp_buffer, nullptr); - // Now we copy the uint16 values into the int32 tensor. - for (size_t j = 0; j < num_pixels_per_row; ++j) { - t_ptr[j] = (int32_t)tmp_buffer[j]; - } - t_ptr += num_pixels_per_row; - } - t_ptr = tensor.accessor().data(); + is_16_bits ? at::kUInt16 : torch::kU8); + if (is_little_endian()) { + png_set_swap(png_ptr); + } + auto t_ptr = (uint8_t*)tensor.data_ptr(); + for (int pass = 0; pass < number_of_passes; pass++) { + for (png_uint_32 i = 0; i < height; ++i) { + png_read_row(png_ptr, t_ptr, nullptr); + t_ptr += num_pixels_per_row * (is_16_bits ? 
2 : 1); } + t_ptr = (uint8_t*)tensor.data_ptr(); + } + + int exif_orientation = -1; + if (apply_exif_orientation) { + exif_orientation = fetch_png_exif_orientation(png_ptr, info_ptr); } + png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); - return tensor.permute({2, 0, 1}); + + auto output = tensor.permute({2, 0, 1}); + if (apply_exif_orientation) { + return exif_orientation_transform(output, exif_orientation); + } + return output; } #endif diff --git a/torchvision/csrc/io/image/cpu/decode_png.h b/torchvision/csrc/io/image/cpu/decode_png.h index fed89327cdb..faaffa7ae49 100644 --- a/torchvision/csrc/io/image/cpu/decode_png.h +++ b/torchvision/csrc/io/image/cpu/decode_png.h @@ -1,7 +1,7 @@ #pragma once #include -#include "../image_read_mode.h" +#include "../common.h" namespace vision { namespace image { @@ -9,7 +9,7 @@ namespace image { C10_EXPORT torch::Tensor decode_png( const torch::Tensor& data, ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED, - bool allow_16_bits = false); + bool apply_exif_orientation = false); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_webp.cpp b/torchvision/csrc/io/image/cpu/decode_webp.cpp new file mode 100644 index 00000000000..80fe68862fb --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_webp.cpp @@ -0,0 +1,66 @@ +#include "decode_webp.h" +#include "../common.h" + +#if WEBP_FOUND +#include "webp/decode.h" +#include "webp/types.h" +#endif // WEBP_FOUND + +namespace vision { +namespace image { + +#if !WEBP_FOUND +torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode) { + TORCH_CHECK( + false, "decode_webp: torchvision not compiled with libwebp support"); +} +#else + +torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode) { + validate_encoded_data(encoded_data); + + auto encoded_data_p = encoded_data.data_ptr(); + auto encoded_data_size = encoded_data.numel(); + + WebPBitstreamFeatures features; + auto res = WebPGetFeatures(encoded_data_p, encoded_data_size, &features); + TORCH_CHECK( + res == VP8_STATUS_OK, "WebPGetFeatures failed with error code ", res); + TORCH_CHECK( + !features.has_animation, "Animated webp files are not supported."); + + if (mode == IMAGE_READ_MODE_GRAY || mode == IMAGE_READ_MODE_GRAY_ALPHA) { + TORCH_WARN_ONCE( + "Webp does not support grayscale conversions. " + "The returned tensor will be in the colorspace of the original image."); + } + + auto return_rgb = + should_this_return_rgb_or_rgba_let_me_know_in_the_comments_down_below_guys_see_you_in_the_next_video( + mode, features.has_alpha); + + auto decoding_func = return_rgb ? WebPDecodeRGB : WebPDecodeRGBA; + auto num_channels = return_rgb ? 
3 : 4; + + int width = 0; + int height = 0; + + auto decoded_data = + decoding_func(encoded_data_p, encoded_data_size, &width, &height); + + TORCH_CHECK(decoded_data != nullptr, "WebPDecodeRGB[A] failed."); + + auto deleter = [decoded_data](void*) { WebPFree(decoded_data); }; + auto out = torch::from_blob( + decoded_data, {height, width, num_channels}, deleter, torch::kUInt8); + + return out.permute({2, 0, 1}); +} +#endif // WEBP_FOUND + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_webp.h b/torchvision/csrc/io/image/cpu/decode_webp.h new file mode 100644 index 00000000000..d5c81547c42 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_webp.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "../common.h" + +namespace vision { +namespace image { + +C10_EXPORT torch::Tensor decode_webp( + const torch::Tensor& encoded_data, + ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/encode_png.cpp b/torchvision/csrc/io/image/cpu/encode_png.cpp index a9b7d76ff61..d55a0ed3ff6 100644 --- a/torchvision/csrc/io/image/cpu/encode_png.cpp +++ b/torchvision/csrc/io/image/cpu/encode_png.cpp @@ -47,13 +47,15 @@ void torch_png_write_data( size_t nsize = p->size + length; /* allocate or grow buffer */ - if (p->buffer) + if (p->buffer) { p->buffer = (char*)realloc(p->buffer, nsize); - else + } else { p->buffer = (char*)malloc(nsize); + } - if (!p->buffer) + if (!p->buffer) { png_error(png_ptr, "Write Error"); + } /* copy new bytes to end of buffer */ memcpy(p->buffer + p->size, data, length); @@ -71,7 +73,7 @@ torch::Tensor encode_png(const torch::Tensor& data, int64_t compression_level) { // Define output buffer struct torch_mem_encode buf_info; - buf_info.buffer = NULL; + buf_info.buffer = nullptr; buf_info.size = 0; /* Establish the setjmp return context for my_error_exit to use. */ @@ -79,15 +81,15 @@ torch::Tensor encode_png(const torch::Tensor& data, int64_t compression_level) { /* If we get here, the PNG code has signaled an error. * We need to clean up the PNG object and the buffer. */ - if (info_ptr != NULL) { + if (info_ptr != nullptr) { png_destroy_info_struct(png_write, &info_ptr); } - if (png_write != NULL) { - png_destroy_write_struct(&png_write, NULL); + if (png_write != nullptr) { + png_destroy_write_struct(&png_write, nullptr); } - if (buf_info.buffer != NULL) { + if (buf_info.buffer != nullptr) { free(buf_info.buffer); } @@ -121,12 +123,12 @@ torch::Tensor encode_png(const torch::Tensor& data, int64_t compression_level) { // Initialize PNG structures png_write = png_create_write_struct( - PNG_LIBPNG_VER_STRING, &err_ptr, torch_png_error, NULL); + PNG_LIBPNG_VER_STRING, &err_ptr, torch_png_error, nullptr); info_ptr = png_create_info_struct(png_write); // Define custom buffer output - png_set_write_fn(png_write, &buf_info, torch_png_write_data, NULL); + png_set_write_fn(png_write, &buf_info, torch_png_write_data, nullptr); // Set output image information auto color_type = channels == 1 ? PNG_COLOR_TYPE_GRAY : PNG_COLOR_TYPE_RGB; diff --git a/torchvision/csrc/io/image/cpu/exif.h b/torchvision/csrc/io/image/cpu/exif.h new file mode 100644 index 00000000000..7680737f8c0 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/exif.h @@ -0,0 +1,257 @@ +// @nolint (improperly imported third-party code) +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// +// By downloading, copying, installing or using the software you agree to this +license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without +modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright +notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote +products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" +and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are +disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any +direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ +#pragma once +// Functions in this module are taken from OpenCV +// https://github.com/opencv/opencv/blob/097891e311fae1d8354eb092a0fd0171e630d78c/modules/imgcodecs/src/exif.cpp + +#if JPEG_FOUND +#include +#endif +#if PNG_FOUND +#include +#endif + +#include + +namespace vision { +namespace image { +namespace exif_private { + +constexpr uint16_t APP1 = 0xe1; +constexpr uint16_t ENDIANNESS_INTEL = 0x49; +constexpr uint16_t ENDIANNESS_MOTO = 0x4d; +constexpr uint16_t REQ_EXIF_TAG_MARK = 0x2a; +constexpr uint16_t ORIENTATION_EXIF_TAG = 0x0112; +constexpr uint16_t INCORRECT_TAG = -1; + +class ExifDataReader { + public: + ExifDataReader(unsigned char* p, size_t s) : _ptr(p), _size(s) {} + size_t size() const { + return _size; + } + const unsigned char& operator[](size_t index) const { + TORCH_CHECK(index >= 0 && index < _size); + return _ptr[index]; + } + + protected: + unsigned char* _ptr; + size_t _size; +}; + +inline uint16_t get_endianness(const ExifDataReader& exif_data) { + if ((exif_data.size() < 1) || + (exif_data.size() > 1 && exif_data[0] != exif_data[1])) { + return 0; + } + if (exif_data[0] == 'I') { + return ENDIANNESS_INTEL; + } + if (exif_data[0] == 'M') { + return ENDIANNESS_MOTO; + } + return 0; +} + +inline uint16_t get_uint16( + const ExifDataReader& exif_data, + uint16_t endianness, + const size_t offset) { + if (offset + 1 >= exif_data.size()) { + return INCORRECT_TAG; + } + + if (endianness == ENDIANNESS_INTEL) { + return exif_data[offset] + (exif_data[offset + 1] << 8); + } + return (exif_data[offset] << 8) + exif_data[offset + 1]; +} + +inline uint32_t get_uint32( + const ExifDataReader& exif_data, + uint16_t endianness, + const size_t offset) { + if (offset + 3 >= exif_data.size()) { + return INCORRECT_TAG; + } + + if (endianness == ENDIANNESS_INTEL) { + return exif_data[offset] + (exif_data[offset + 1] << 8) + + (exif_data[offset + 2] << 16) + (exif_data[offset + 3] << 24); + } + return (exif_data[offset] << 24) + (exif_data[offset + 1] << 16) + + (exif_data[offset + 2] << 8) + exif_data[offset + 3]; +} + +inline int fetch_exif_orientation(unsigned char* exif_data_ptr, size_t size) { + int exif_orientation = -1; + + // Exif binary structure looks like this + // First 6 bytes: [E, x, i, f, 0, 0] + // Endianness, 2 bytes : [M, M] or [I, I] + // Tag mark, 2 bytes: [0, 0x2a] + // Offset, 4 bytes + // Num entries, 2 bytes + // Tag entries and data, tag has 2 bytes and its data has 10 bytes + // For more details: + // http://www.media.mit.edu/pia/Research/deepview/exif.html + + ExifDataReader exif_data(exif_data_ptr, size); + auto endianness = get_endianness(exif_data); + + // Checking whether Tag Mark (0x002A) correspond to one contained in the + // Jpeg file + uint16_t tag_mark = get_uint16(exif_data, endianness, 2); + if (tag_mark == REQ_EXIF_TAG_MARK) { + auto offset = get_uint32(exif_data, endianness, 4); + size_t num_entry = get_uint16(exif_data, endianness, offset); + offset += 2; // go to start of tag fields + constexpr size_t tiff_field_size = 12; + for (size_t entry = 0; entry < num_entry; entry++) { + // Here we just search for orientation tag and parse it + auto tag_num = get_uint16(exif_data, endianness, offset); + if (tag_num == INCORRECT_TAG) { + break; + } + if (tag_num == ORIENTATION_EXIF_TAG) { + exif_orientation = get_uint16(exif_data, endianness, offset + 8); + break; + } + offset += tiff_field_size; + } + } + return exif_orientation; +} + +#if JPEG_FOUND +inline int fetch_jpeg_exif_orientation(j_decompress_ptr cinfo) { + // Check for Exif 
marker APP1 + jpeg_saved_marker_ptr exif_marker = 0; + jpeg_saved_marker_ptr cmarker = cinfo->marker_list; + while (cmarker && exif_marker == 0) { + if (cmarker->marker == APP1) { + exif_marker = cmarker; + } + cmarker = cmarker->next; + } + + if (!exif_marker) { + return -1; + } + + constexpr size_t start_offset = 6; + if (exif_marker->data_length <= start_offset) { + return -1; + } + + auto* exif_data_ptr = exif_marker->data + start_offset; + auto size = exif_marker->data_length - start_offset; + + return fetch_exif_orientation(exif_data_ptr, size); +} +#endif // #if JPEG_FOUND + +#if PNG_FOUND && defined(PNG_eXIf_SUPPORTED) +inline int fetch_png_exif_orientation(png_structp png_ptr, png_infop info_ptr) { + png_uint_32 num_exif = 0; + png_bytep exif = 0; + + // Exif info could be in info_ptr + if (png_get_valid(png_ptr, info_ptr, PNG_INFO_eXIf)) { + png_get_eXIf_1(png_ptr, info_ptr, &num_exif, &exif); + } + + if (exif && num_exif > 0) { + return fetch_exif_orientation(exif, num_exif); + } + return -1; +} +#endif // #if PNG_FOUND && defined(PNG_eXIf_SUPPORTED) + +constexpr uint16_t IMAGE_ORIENTATION_TL = 1; // normal orientation +constexpr uint16_t IMAGE_ORIENTATION_TR = 2; // needs horizontal flip +constexpr uint16_t IMAGE_ORIENTATION_BR = 3; // needs 180 rotation +constexpr uint16_t IMAGE_ORIENTATION_BL = 4; // needs vertical flip +constexpr uint16_t IMAGE_ORIENTATION_LT = + 5; // mirrored horizontal & rotate 270 CW +constexpr uint16_t IMAGE_ORIENTATION_RT = 6; // rotate 90 CW +constexpr uint16_t IMAGE_ORIENTATION_RB = + 7; // mirrored horizontal & rotate 90 CW +constexpr uint16_t IMAGE_ORIENTATION_LB = 8; // needs 270 CW rotation + +inline torch::Tensor exif_orientation_transform( + const torch::Tensor& image, + int orientation) { + if (orientation == IMAGE_ORIENTATION_TL) { + return image; + } else if (orientation == IMAGE_ORIENTATION_TR) { + return image.flip(-1); + } else if (orientation == IMAGE_ORIENTATION_BR) { + // needs 180 rotation equivalent to + // flip both horizontally and vertically + return image.flip({-2, -1}); + } else if (orientation == IMAGE_ORIENTATION_BL) { + return image.flip(-2); + } else if (orientation == IMAGE_ORIENTATION_LT) { + return image.transpose(-1, -2); + } else if (orientation == IMAGE_ORIENTATION_RT) { + return image.transpose(-1, -2).flip(-1); + } else if (orientation == IMAGE_ORIENTATION_RB) { + return image.transpose(-1, -2).flip({-2, -1}); + } else if (orientation == IMAGE_ORIENTATION_LB) { + return image.transpose(-1, -2).flip(-2); + } + return image; +} + +} // namespace exif_private +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/giflib/README b/torchvision/csrc/io/image/cpu/giflib/README new file mode 100644 index 00000000000..7353453e32e --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/README @@ -0,0 +1,28 @@ +These files come from the GIFLIB project (https://giflib.sourceforge.net/) and +are licensed under the MIT license. + +Some modifications have been made to the original files: +- Remove use of "register" keyword in gifalloc.c for C++17 compatibility. +- Declare loop variable i in DGifGetImageHeader as int instead of unsigned int. 
+ +Below is the original license text from the COPYING file of the GIFLIB project: + += MIT LICENSE + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c b/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c new file mode 100644 index 00000000000..7d35fff87ee --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c @@ -0,0 +1,1313 @@ +// @nolint (improperly imported third-party code) +/****************************************************************************** + +dgif_lib.c - GIF decoding + +The functions here and in egif_lib.c are partitioned carefully so that +if you only require one of read and write capability, only one of these +two modules will be linked. Preserve this property! + +*****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright (C) Eric S. Raymond + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#endif /* _WIN32 */ + +#include "gif_lib.h" +#include "gif_lib_private.h" + +/* compose unsigned little endian value */ +#define UNSIGNED_LITTLE_ENDIAN(lo, hi) ((lo) | ((hi) << 8)) + +/* avoid extra function call in case we use fread (TVT) */ +static int InternalRead(GifFileType *gif, GifByteType *buf, int len) { + // fprintf(stderr, "### Read: %d\n", len); + return (((GifFilePrivateType *)gif->Private)->Read + ? ((GifFilePrivateType *)gif->Private)->Read(gif, buf, len) + : fread(buf, 1, len, + ((GifFilePrivateType *)gif->Private)->File)); +} + +static int DGifGetWord(GifFileType *GifFile, GifWord *Word); +static int DGifSetupDecompress(GifFileType *GifFile); +static int DGifDecompressLine(GifFileType *GifFile, GifPixelType *Line, + int LineLen); +static int DGifGetPrefixChar(const GifPrefixType *Prefix, int Code, + int ClearCode); +static int DGifDecompressInput(GifFileType *GifFile, int *Code); +static int DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, + GifByteType *NextByte); + +/****************************************************************************** + Open a new GIF file for read, given by its name. + Returns dynamically allocated GifFileType pointer which serves as the GIF + info record. 
+******************************************************************************/ +GifFileType *DGifOpenFileName(const char *FileName, int *Error) { + int FileHandle; + GifFileType *GifFile; + + if ((FileHandle = open(FileName, O_RDONLY)) == -1) { + if (Error != NULL) { + *Error = D_GIF_ERR_OPEN_FAILED; + } + return NULL; + } + + GifFile = DGifOpenFileHandle(FileHandle, Error); + return GifFile; +} + +/****************************************************************************** + Update a new GIF file, given its file handle. + Returns dynamically allocated GifFileType pointer which serves as the GIF + info record. +******************************************************************************/ +GifFileType *DGifOpenFileHandle(int FileHandle, int *Error) { + char Buf[GIF_STAMP_LEN + 1]; + GifFileType *GifFile; + GifFilePrivateType *Private; + FILE *f; + + GifFile = (GifFileType *)malloc(sizeof(GifFileType)); + if (GifFile == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + (void)close(FileHandle); + return NULL; + } + + /*@i1@*/ memset(GifFile, '\0', sizeof(GifFileType)); + + /* Belt and suspenders, in case the null pointer isn't zero */ + GifFile->SavedImages = NULL; + GifFile->SColorMap = NULL; + + Private = (GifFilePrivateType *)calloc(1, sizeof(GifFilePrivateType)); + if (Private == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + (void)close(FileHandle); + free((char *)GifFile); + return NULL; + } + + /*@i1@*/ memset(Private, '\0', sizeof(GifFilePrivateType)); + +#ifdef _WIN32 + _setmode(FileHandle, O_BINARY); /* Make sure it is in binary mode. */ +#endif /* _WIN32 */ + + f = fdopen(FileHandle, "rb"); /* Make it into a stream: */ + + /*@-mustfreeonly@*/ + GifFile->Private = (void *)Private; + Private->FileHandle = FileHandle; + Private->File = f; + Private->FileState = FILE_STATE_READ; + Private->Read = NULL; /* don't use alternate input method (TVT) */ + GifFile->UserData = NULL; /* TVT */ + /*@=mustfreeonly@*/ + + /* Let's see if this is a GIF file: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, (unsigned char *)Buf, GIF_STAMP_LEN) != + GIF_STAMP_LEN) { + if (Error != NULL) { + *Error = D_GIF_ERR_READ_FAILED; + } + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + /* Check for GIF prefix at start of file */ + Buf[GIF_STAMP_LEN] = 0; + if (strncmp(GIF_STAMP, Buf, GIF_VERSION_POS) != 0) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_GIF_FILE; + } + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + if (DGifGetScreenDesc(GifFile) == GIF_ERROR) { + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + GifFile->Error = 0; + + /* What version of GIF? 
*/ + Private->gif89 = (Buf[GIF_VERSION_POS + 1] == '9'); + + return GifFile; +} + +/****************************************************************************** + GifFileType constructor with user supplied input function (TVT) +******************************************************************************/ +GifFileType *DGifOpen(void *userData, InputFunc readFunc, int *Error) { + char Buf[GIF_STAMP_LEN + 1]; + GifFileType *GifFile; + GifFilePrivateType *Private; + + GifFile = (GifFileType *)malloc(sizeof(GifFileType)); + if (GifFile == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + return NULL; + } + + memset(GifFile, '\0', sizeof(GifFileType)); + + /* Belt and suspenders, in case the null pointer isn't zero */ + GifFile->SavedImages = NULL; + GifFile->SColorMap = NULL; + + Private = (GifFilePrivateType *)calloc(1, sizeof(GifFilePrivateType)); + if (!Private) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + free((char *)GifFile); + return NULL; + } + /*@i1@*/ memset(Private, '\0', sizeof(GifFilePrivateType)); + + GifFile->Private = (void *)Private; + Private->FileHandle = 0; + Private->File = NULL; + Private->FileState = FILE_STATE_READ; + + Private->Read = readFunc; /* TVT */ + GifFile->UserData = userData; /* TVT */ + + /* Lets see if this is a GIF file: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, (unsigned char *)Buf, GIF_STAMP_LEN) != + GIF_STAMP_LEN) { + if (Error != NULL) { + *Error = D_GIF_ERR_READ_FAILED; + } + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + /* Check for GIF prefix at start of file */ + Buf[GIF_STAMP_LEN] = '\0'; + if (strncmp(GIF_STAMP, Buf, GIF_VERSION_POS) != 0) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_GIF_FILE; + } + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + if (DGifGetScreenDesc(GifFile) == GIF_ERROR) { + free((char *)Private); + free((char *)GifFile); + if (Error != NULL) { + *Error = D_GIF_ERR_NO_SCRN_DSCR; + } + return NULL; + } + + GifFile->Error = 0; + + /* What version of GIF? */ + Private->gif89 = (Buf[GIF_VERSION_POS + 1] == '9'); + + return GifFile; +} + +/****************************************************************************** + This routine should be called before any other DGif calls. Note that + this routine is called automatically from DGif file open routines. +******************************************************************************/ +int DGifGetScreenDesc(GifFileType *GifFile) { + int BitsPerPixel; + bool SortFlag; + GifByteType Buf[3]; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* Put the screen descriptor into the file: */ + if (DGifGetWord(GifFile, &GifFile->SWidth) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->SHeight) == GIF_ERROR) { + return GIF_ERROR; + } + + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + return GIF_ERROR; + } + GifFile->SColorResolution = (((Buf[0] & 0x70) + 1) >> 4) + 1; + SortFlag = (Buf[0] & 0x08) != 0; + BitsPerPixel = (Buf[0] & 0x07) + 1; + GifFile->SBackGroundColor = Buf[1]; + GifFile->AspectByte = Buf[2]; + if (Buf[0] & 0x80) { /* Do we have global color map? 
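The DGifOpen/InputFunc path above is how a GIF can be decoded straight from a memory buffer. A sketch of such a read callback; the MemSource type is hypothetical, only DGifOpen, InputFunc and UserData come from this code:

#include <string.h>
#include "gif_lib.h"

/* Hypothetical in-memory source; the struct name and fields are illustrative. */
typedef struct {
    const GifByteType *data;
    size_t len;
    size_t pos;
} MemSource;

static int mem_read(GifFileType *gif, GifByteType *buf, int want) {
    MemSource *src = (MemSource *)gif->UserData;
    size_t left = src->len - src->pos;
    size_t n = (size_t)want < left ? (size_t)want : left;
    memcpy(buf, src->data + src->pos, n);
    src->pos += n;
    return (int)n; /* a short count makes giflib report D_GIF_ERR_READ_FAILED */
}

/* Usage: MemSource src = {bytes, nbytes, 0};
 *        GifFileType *gif = DGifOpen(&src, mem_read, &err); */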
*/ + int i; + + GifFile->SColorMap = GifMakeMapObject(1 << BitsPerPixel, NULL); + if (GifFile->SColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + + /* Get the global color map: */ + GifFile->SColorMap->SortFlag = SortFlag; + for (i = 0; i < GifFile->SColorMap->ColorCount; i++) { + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + GifFile->SColorMap->Colors[i].Red = Buf[0]; + GifFile->SColorMap->Colors[i].Green = Buf[1]; + GifFile->SColorMap->Colors[i].Blue = Buf[2]; + } + } else { + GifFile->SColorMap = NULL; + } + + /* + * No check here for whether the background color is in range for the + * screen color map. Possibly there should be. + */ + + return GIF_OK; +} + +const char *DGifGetGifVersion(GifFileType *GifFile) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (Private->gif89) { + return GIF89_STAMP; + } else { + return GIF87_STAMP; + } +} + +/****************************************************************************** + This routine should be called before any attempt to read an image. +******************************************************************************/ +int DGifGetRecordType(GifFileType *GifFile, GifRecordType *Type) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + // fprintf(stderr, "### DGifGetRecordType: %02x\n", Buf); + switch (Buf) { + case DESCRIPTOR_INTRODUCER: + *Type = IMAGE_DESC_RECORD_TYPE; + break; + case EXTENSION_INTRODUCER: + *Type = EXTENSION_RECORD_TYPE; + break; + case TERMINATOR_INTRODUCER: + *Type = TERMINATE_RECORD_TYPE; + break; + default: + *Type = UNDEFINED_RECORD_TYPE; + GifFile->Error = D_GIF_ERR_WRONG_RECORD; + return GIF_ERROR; + } + + return GIF_OK; +} + +int DGifGetImageHeader(GifFileType *GifFile) { + unsigned int BitsPerPixel; + GifByteType Buf[3]; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifGetWord(GifFile, &GifFile->Image.Left) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Top) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Width) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Height) == GIF_ERROR) { + return GIF_ERROR; + } + if (InternalRead(GifFile, Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + return GIF_ERROR; + } + BitsPerPixel = (Buf[0] & 0x07) + 1; + GifFile->Image.Interlace = (Buf[0] & 0x40) ? true : false; + + /* Setup the colormap */ + if (GifFile->Image.ColorMap) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + } + /* Does this image have local color map? 
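DGifGetRecordType above is meant to drive a loop over the file's records until the trailer; a skeleton of that loop, illustrative only and with error handling trimmed:

#include "gif_lib.h"

static int walk_records(GifFileType *gif) {
    GifRecordType rec;
    do {
        if (DGifGetRecordType(gif, &rec) == GIF_ERROR) {
            return GIF_ERROR; /* gif->Error holds a D_GIF_ERR_* code */
        }
        switch (rec) {
        case IMAGE_DESC_RECORD_TYPE:
            /* DGifGetImageDesc(), then DGifGetLine() once per row */
            break;
        case EXTENSION_RECORD_TYPE:
            /* DGifGetExtension(), then DGifGetExtensionNext() until NULL */
            break;
        default:
            break;
        }
    } while (rec != TERMINATE_RECORD_TYPE);
    return GIF_OK;
}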
*/ + if (Buf[0] & 0x80) { + int i; + + GifFile->Image.ColorMap = + GifMakeMapObject(1 << BitsPerPixel, NULL); + if (GifFile->Image.ColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + + /* Get the image local color map: */ + for (i = 0; i < GifFile->Image.ColorMap->ColorCount; i++) { + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFile->Image.ColorMap = NULL; + return GIF_ERROR; + } + GifFile->Image.ColorMap->Colors[i].Red = Buf[0]; + GifFile->Image.ColorMap->Colors[i].Green = Buf[1]; + GifFile->Image.ColorMap->Colors[i].Blue = Buf[2]; + } + } + + Private->PixelCount = + (long)GifFile->Image.Width * (long)GifFile->Image.Height; + + /* Reset decompress algorithm parameters. */ + return DGifSetupDecompress(GifFile); +} + +/****************************************************************************** + This routine should be called before any attempt to read an image. + Note it is assumed the Image desc. header has been read. +******************************************************************************/ +int DGifGetImageDesc(GifFileType *GifFile) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + SavedImage *sp; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifGetImageHeader(GifFile) == GIF_ERROR) { + return GIF_ERROR; + } + + if (GifFile->SavedImages) { + SavedImage *new_saved_images = (SavedImage *)reallocarray( + GifFile->SavedImages, (GifFile->ImageCount + 1), + sizeof(SavedImage)); + if (new_saved_images == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + GifFile->SavedImages = new_saved_images; + } else { + if ((GifFile->SavedImages = + (SavedImage *)malloc(sizeof(SavedImage))) == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + } + + sp = &GifFile->SavedImages[GifFile->ImageCount]; + memcpy(&sp->ImageDesc, &GifFile->Image, sizeof(GifImageDesc)); + if (GifFile->Image.ColorMap != NULL) { + sp->ImageDesc.ColorMap = + GifMakeMapObject(GifFile->Image.ColorMap->ColorCount, + GifFile->Image.ColorMap->Colors); + if (sp->ImageDesc.ColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + } + sp->RasterBits = (unsigned char *)NULL; + sp->ExtensionBlockCount = 0; + sp->ExtensionBlocks = (ExtensionBlock *)NULL; + + GifFile->ImageCount++; + + return GIF_OK; +} + +/****************************************************************************** + Get one full scanned line (Line) of length LineLen from GIF file. 
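Combining DGifGetImageDesc with the line reader documented here, a non-interlaced frame can be pulled row by row into a caller-owned buffer. An illustrative sketch that assumes an IMAGE_DESC record has just been reported and ignores interlacing:

#include <stdlib.h>
#include "gif_lib.h"

static GifPixelType *read_frame_rows(GifFileType *gif) {
    if (DGifGetImageDesc(gif) == GIF_ERROR) {
        return NULL;
    }
    int w = gif->Image.Width;
    int h = gif->Image.Height;
    GifPixelType *px = (GifPixelType *)malloc((size_t)w * (size_t)h);
    if (px == NULL) {
        return NULL;
    }
    for (int y = 0; y < h; y++) {
        if (DGifGetLine(gif, px + (size_t)y * (size_t)w, w) == GIF_ERROR) {
            free(px);
            return NULL;
        }
    }
    return px; /* palette indices, one byte per pixel; caller frees */
}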
+******************************************************************************/ +int DGifGetLine(GifFileType *GifFile, GifPixelType *Line, int LineLen) { + GifByteType *Dummy; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (!LineLen) { + LineLen = GifFile->Image.Width; + } + + if ((Private->PixelCount -= LineLen) > 0xffff0000UL) { + GifFile->Error = D_GIF_ERR_DATA_TOO_BIG; + return GIF_ERROR; + } + + if (DGifDecompressLine(GifFile, Line, LineLen) == GIF_OK) { + if (Private->PixelCount == 0) { + /* We probably won't be called any more, so let's clean + * up everything before we return: need to flush out all + * the rest of image until an empty block (size 0) + * detected. We use GetCodeNext. + */ + do { + if (DGifGetCodeNext(GifFile, &Dummy) == + GIF_ERROR) { + return GIF_ERROR; + } + } while (Dummy != NULL); + } + return GIF_OK; + } else { + return GIF_ERROR; + } +} + +/****************************************************************************** + Put one pixel (Pixel) into GIF file. +******************************************************************************/ +int DGifGetPixel(GifFileType *GifFile, GifPixelType Pixel) { + GifByteType *Dummy; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + if (--Private->PixelCount > 0xffff0000UL) { + GifFile->Error = D_GIF_ERR_DATA_TOO_BIG; + return GIF_ERROR; + } + + if (DGifDecompressLine(GifFile, &Pixel, 1) == GIF_OK) { + if (Private->PixelCount == 0) { + /* We probably won't be called any more, so let's clean + * up everything before we return: need to flush out all + * the rest of image until an empty block (size 0) + * detected. We use GetCodeNext. + */ + do { + if (DGifGetCodeNext(GifFile, &Dummy) == + GIF_ERROR) { + return GIF_ERROR; + } + } while (Dummy != NULL); + } + return GIF_OK; + } else { + return GIF_ERROR; + } +} + +/****************************************************************************** + Get an extension block (see GIF manual) from GIF file. This routine only + returns the first data block, and DGifGetExtensionNext should be called + after this one until NULL extension is returned. + The Extension should NOT be freed by the user (not dynamically allocated). + Note it is assumed the Extension description header has been read. +******************************************************************************/ +int DGifGetExtension(GifFileType *GifFile, int *ExtCode, + GifByteType **Extension) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + // fprintf(stderr, "### -> DGifGetExtension:\n"); + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + *ExtCode = Buf; + // fprintf(stderr, "### <- DGifGetExtension: %02x, about to call + // next\n", Buf); + + return DGifGetExtensionNext(GifFile, Extension); +} + +/****************************************************************************** + Get a following extension block (see GIF manual) from GIF file. This + routine should be called until NULL Extension is returned. 
+ The Extension should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetExtensionNext(GifFileType *GifFile, GifByteType **Extension) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + // fprintf(stderr, "### -> DGifGetExtensionNext\n"); + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + // fprintf(stderr, "### DGifGetExtensionNext sees %d\n", Buf); + + if (Buf > 0) { + *Extension = Private->Buf; /* Use private unused buffer. */ + (*Extension)[0] = + Buf; /* Pascal strings notation (pos. 0 is len.). */ + /* coverity[tainted_data,check_return] */ + if (InternalRead(GifFile, &((*Extension)[1]), Buf) != Buf) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + } else { + *Extension = NULL; + } + // fprintf(stderr, "### <- DGifGetExtensionNext: %p\n", Extension); + + return GIF_OK; +} + +/****************************************************************************** + Extract a Graphics Control Block from raw extension data +******************************************************************************/ + +int DGifExtensionToGCB(const size_t GifExtensionLength, + const GifByteType *GifExtension, + GraphicsControlBlock *GCB) { + if (GifExtensionLength != 4) { + return GIF_ERROR; + } + + GCB->DisposalMode = (GifExtension[0] >> 2) & 0x07; + GCB->UserInputFlag = (GifExtension[0] & 0x02) != 0; + GCB->DelayTime = + UNSIGNED_LITTLE_ENDIAN(GifExtension[1], GifExtension[2]); + if (GifExtension[0] & 0x01) { + GCB->TransparentColor = (int)GifExtension[3]; + } else { + GCB->TransparentColor = NO_TRANSPARENT_COLOR; + } + + return GIF_OK; +} + +/****************************************************************************** + Extract the Graphics Control Block for a saved image, if it exists. +******************************************************************************/ + +int DGifSavedExtensionToGCB(GifFileType *GifFile, int ImageIndex, + GraphicsControlBlock *GCB) { + int i; + + if (ImageIndex < 0 || ImageIndex > GifFile->ImageCount - 1) { + return GIF_ERROR; + } + + GCB->DisposalMode = DISPOSAL_UNSPECIFIED; + GCB->UserInputFlag = false; + GCB->DelayTime = 0; + GCB->TransparentColor = NO_TRANSPARENT_COLOR; + + for (i = 0; i < GifFile->SavedImages[ImageIndex].ExtensionBlockCount; + i++) { + ExtensionBlock *ep = + &GifFile->SavedImages[ImageIndex].ExtensionBlocks[i]; + if (ep->Function == GRAPHICS_EXT_FUNC_CODE) { + return DGifExtensionToGCB(ep->ByteCount, ep->Bytes, + GCB); + } + } + + return GIF_ERROR; +} + +/****************************************************************************** + This routine should be called last, to close the GIF file. 
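Once a file has been slurped, DGifSavedExtensionToGCB above is the usual way to recover per-frame timing and transparency; a small illustrative helper:

#include "gif_lib.h"

/* Returns the delay of frame i in milliseconds; 0 if the frame has no GCB. */
static int frame_delay_ms(GifFileType *gif, int i, int *transparent_index) {
    GraphicsControlBlock gcb;
    if (DGifSavedExtensionToGCB(gif, i, &gcb) == GIF_ERROR) {
        *transparent_index = NO_TRANSPARENT_COLOR;
        return 0;
    }
    *transparent_index = gcb.TransparentColor; /* -1 when opaque */
    return gcb.DelayTime * 10; /* DelayTime is stored in 1/100 s units */
}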
+******************************************************************************/ +int DGifCloseFile(GifFileType *GifFile, int *ErrorCode) { + GifFilePrivateType *Private; + + if (GifFile == NULL || GifFile->Private == NULL) { + return GIF_ERROR; + } + + if (GifFile->Image.ColorMap) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + } + + if (GifFile->SColorMap) { + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + } + + if (GifFile->SavedImages) { + GifFreeSavedImages(GifFile); + GifFile->SavedImages = NULL; + } + + GifFreeExtensions(&GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks); + + Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_ERR_NOT_READABLE; + } + free((char *)GifFile->Private); + free(GifFile); + return GIF_ERROR; + } + + if (Private->File && (fclose(Private->File) != 0)) { + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_ERR_CLOSE_FAILED; + } + free((char *)GifFile->Private); + free(GifFile); + return GIF_ERROR; + } + + free((char *)GifFile->Private); + free(GifFile); + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_SUCCEEDED; + } + return GIF_OK; +} + +/****************************************************************************** + Get 2 bytes (word) from the given file: +******************************************************************************/ +static int DGifGetWord(GifFileType *GifFile, GifWord *Word) { + unsigned char c[2]; + + /* coverity[check_return] */ + if (InternalRead(GifFile, c, 2) != 2) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + *Word = (GifWord)UNSIGNED_LITTLE_ENDIAN(c[0], c[1]); + return GIF_OK; +} + +/****************************************************************************** + Get the image code in compressed form. This routine can be called if the + information needed to be piped out as is. Obviously this is much faster + than decoding and encoding again. This routine should be followed by calls + to DGifGetCodeNext, until NULL block is returned. + The block should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetCode(GifFileType *GifFile, int *CodeSize, GifByteType **CodeBlock) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + *CodeSize = Private->BitsPerPixel; + + return DGifGetCodeNext(GifFile, CodeBlock); +} + +/****************************************************************************** + Continue to get the image code in compressed form. This routine should be + called until NULL block is returned. + The block should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetCodeNext(GifFileType *GifFile, GifByteType **CodeBlock) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + /* coverity[tainted_data_argument] */ + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + /* coverity[lower_bounds] */ + if (Buf > 0) { + *CodeBlock = Private->Buf; /* Use private unused buffer. */ + (*CodeBlock)[0] = + Buf; /* Pascal strings notation (pos. 
0 is len.). */ + /* coverity[tainted_data] */ + if (InternalRead(GifFile, &((*CodeBlock)[1]), Buf) != Buf) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + } else { + *CodeBlock = NULL; + Private->Buf[0] = 0; /* Make sure the buffer is empty! */ + Private->PixelCount = + 0; /* And local info. indicate image read. */ + } + + return GIF_OK; +} + +/****************************************************************************** + Setup the LZ decompression for this image: +******************************************************************************/ +static int DGifSetupDecompress(GifFileType *GifFile) { + int i, BitsPerPixel; + GifByteType CodeSize; + GifPrefixType *Prefix; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + /* coverity[check_return] */ + if (InternalRead(GifFile, &CodeSize, 1) < + 1) { /* Read Code size from file. */ + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; /* Failed to read Code size. */ + } + BitsPerPixel = CodeSize; + + /* this can only happen on a severely malformed GIF */ + if (BitsPerPixel > 8) { + GifFile->Error = + D_GIF_ERR_READ_FAILED; /* somewhat bogus error code */ + return GIF_ERROR; /* Failed to read Code size. */ + } + + Private->Buf[0] = 0; /* Input Buffer empty. */ + Private->BitsPerPixel = BitsPerPixel; + Private->ClearCode = (1 << BitsPerPixel); + Private->EOFCode = Private->ClearCode + 1; + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = BitsPerPixel + 1; /* Number of bits per code. */ + Private->MaxCode1 = 1 << Private->RunningBits; /* Max. code + 1. */ + Private->StackPtr = 0; /* No pixels on the pixel stack. */ + Private->LastCode = NO_SUCH_CODE; + Private->CrntShiftState = 0; /* No information in CrntShiftDWord. */ + Private->CrntShiftDWord = 0; + + Prefix = Private->Prefix; + for (i = 0; i <= LZ_MAX_CODE; i++) { + Prefix[i] = NO_SUCH_CODE; + } + + return GIF_OK; +} + +/****************************************************************************** + The LZ decompression routine: + This version decompress the given GIF file into Line of length LineLen. + This routine can be called few times (one per scan line, for example), in + order the complete the whole image. +******************************************************************************/ +static int DGifDecompressLine(GifFileType *GifFile, GifPixelType *Line, + int LineLen) { + int i = 0; + int j, CrntCode, EOFCode, ClearCode, CrntPrefix, LastCode, StackPtr; + GifByteType *Stack, *Suffix; + GifPrefixType *Prefix; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + StackPtr = Private->StackPtr; + Prefix = Private->Prefix; + Suffix = Private->Suffix; + Stack = Private->Stack; + EOFCode = Private->EOFCode; + ClearCode = Private->ClearCode; + LastCode = Private->LastCode; + + if (StackPtr > LZ_MAX_CODE) { + return GIF_ERROR; + } + + if (StackPtr != 0) { + /* Let pop the stack off before continueing to read the GIF + * file: */ + while (StackPtr != 0 && i < LineLen) { + Line[i++] = Stack[--StackPtr]; + } + } + + while (i < LineLen) { /* Decode LineLen items. */ + if (DGifDecompressInput(GifFile, &CrntCode) == GIF_ERROR) { + return GIF_ERROR; + } + + if (CrntCode == EOFCode) { + /* Note however that usually we will not be here as we + * will stop decoding as soon as we got all the pixel, + * or EOF code will not be read at all, and + * DGifGetLine/Pixel clean everything. 
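As a concrete instance of the setup in DGifSetupDecompress above, a code-size byte of 2 (a 4-color image) yields ClearCode 4, EOFCode 5, a first dynamic code of 6, 3-bit codes to start, and MaxCode1 of 8; the code width then grows toward LZ_BITS (12) as entries are added. A tiny check of that arithmetic:

#include <assert.h>

static void lzw_setup_example(void) {
    int BitsPerPixel = 2;                /* code size byte from the stream */
    int ClearCode = 1 << BitsPerPixel;   /* 4 */
    int EOFCode = ClearCode + 1;         /* 5 */
    int RunningCode = EOFCode + 1;       /* 6: first free table entry */
    int RunningBits = BitsPerPixel + 1;  /* codes start at 3 bits */
    int MaxCode1 = 1 << RunningBits;     /* 8: widen codes when reached */
    assert(ClearCode == 4 && EOFCode == 5 && RunningCode == 6);
    assert(RunningBits == 3 && MaxCode1 == 8);
}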
*/ + GifFile->Error = D_GIF_ERR_EOF_TOO_SOON; + return GIF_ERROR; + } else if (CrntCode == ClearCode) { + /* We need to start over again: */ + for (j = 0; j <= LZ_MAX_CODE; j++) { + Prefix[j] = NO_SUCH_CODE; + } + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = Private->BitsPerPixel + 1; + Private->MaxCode1 = 1 << Private->RunningBits; + LastCode = Private->LastCode = NO_SUCH_CODE; + } else { + /* Its regular code - if in pixel range simply add it to + * output stream, otherwise trace to codes linked list + * until the prefix is in pixel range: */ + if (CrntCode < ClearCode) { + /* This is simple - its pixel scalar, so add it + * to output: */ + Line[i++] = CrntCode; + } else { + /* Its a code to needed to be traced: trace the + * linked list until the prefix is a pixel, + * while pushing the suffix pixels on our stack. + * If we done, pop the stack in reverse (thats + * what stack is good for!) order to output. */ + if (Prefix[CrntCode] == NO_SUCH_CODE) { + CrntPrefix = LastCode; + + /* Only allowed if CrntCode is exactly + * the running code: In that case + * CrntCode = XXXCode, CrntCode or the + * prefix code is last code and the + * suffix char is exactly the prefix of + * last code! */ + if (CrntCode == + Private->RunningCode - 2) { + Suffix[Private->RunningCode - + 2] = Stack[StackPtr++] = + DGifGetPrefixChar( + Prefix, LastCode, + ClearCode); + } else { + Suffix[Private->RunningCode - + 2] = Stack[StackPtr++] = + DGifGetPrefixChar( + Prefix, CrntCode, + ClearCode); + } + } else { + CrntPrefix = CrntCode; + } + + /* Now (if image is O.K.) we should not get a + * NO_SUCH_CODE during the trace. As we might + * loop forever, in case of defective image, we + * use StackPtr as loop counter and stop before + * overflowing Stack[]. */ + while (StackPtr < LZ_MAX_CODE && + CrntPrefix > ClearCode && + CrntPrefix <= LZ_MAX_CODE) { + Stack[StackPtr++] = Suffix[CrntPrefix]; + CrntPrefix = Prefix[CrntPrefix]; + } + if (StackPtr >= LZ_MAX_CODE || + CrntPrefix > LZ_MAX_CODE) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + /* Push the last character on stack: */ + Stack[StackPtr++] = CrntPrefix; + + /* Now lets pop all the stack into output: */ + while (StackPtr != 0 && i < LineLen) { + Line[i++] = Stack[--StackPtr]; + } + } + if (LastCode != NO_SUCH_CODE && + Private->RunningCode - 2 < (LZ_MAX_CODE + 1) && + Prefix[Private->RunningCode - 2] == NO_SUCH_CODE) { + Prefix[Private->RunningCode - 2] = LastCode; + + if (CrntCode == Private->RunningCode - 2) { + /* Only allowed if CrntCode is exactly + * the running code: In that case + * CrntCode = XXXCode, CrntCode or the + * prefix code is last code and the + * suffix char is exactly the prefix of + * last code! */ + Suffix[Private->RunningCode - 2] = + DGifGetPrefixChar(Prefix, LastCode, + ClearCode); + } else { + Suffix[Private->RunningCode - 2] = + DGifGetPrefixChar(Prefix, CrntCode, + ClearCode); + } + } + LastCode = CrntCode; + } + } + + Private->LastCode = LastCode; + Private->StackPtr = StackPtr; + + return GIF_OK; +} + +/****************************************************************************** + Routine to trace the Prefixes linked list until we get a prefix which is + not code, but a pixel value (less than ClearCode). Returns that pixel value. + If image is defective, we might loop here forever, so we limit the loops to + the maximum possible if image O.k. - LZ_MAX_CODE times. 
+******************************************************************************/ +static int DGifGetPrefixChar(const GifPrefixType *Prefix, int Code, + int ClearCode) { + int i = 0; + + while (Code > ClearCode && i++ <= LZ_MAX_CODE) { + if (Code > LZ_MAX_CODE) { + return NO_SUCH_CODE; + } + Code = Prefix[Code]; + } + return Code; +} + +/****************************************************************************** + Interface for accessing the LZ codes directly. Set Code to the real code + (12bits), or to -1 if EOF code is returned. +******************************************************************************/ +int DGifGetLZCodes(GifFileType *GifFile, int *Code) { + GifByteType *CodeBlock; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifDecompressInput(GifFile, Code) == GIF_ERROR) { + return GIF_ERROR; + } + + if (*Code == Private->EOFCode) { + /* Skip rest of codes (hopefully only NULL terminating block): + */ + do { + if (DGifGetCodeNext(GifFile, &CodeBlock) == GIF_ERROR) { + return GIF_ERROR; + } + } while (CodeBlock != NULL); + + *Code = -1; + } else if (*Code == Private->ClearCode) { + /* We need to start over again: */ + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = Private->BitsPerPixel + 1; + Private->MaxCode1 = 1 << Private->RunningBits; + } + + return GIF_OK; +} + +/****************************************************************************** + The LZ decompression input routine: + This routine is responsable for the decompression of the bit stream from + 8 bits (bytes) packets, into the real codes. + Returns GIF_OK if read successfully. +******************************************************************************/ +static int DGifDecompressInput(GifFileType *GifFile, int *Code) { + static const unsigned short CodeMasks[] = { + 0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, + 0x007f, 0x00ff, 0x01ff, 0x03ff, 0x07ff, 0x0fff}; + + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + GifByteType NextByte; + + /* The image can't contain more than LZ_BITS per code. */ + if (Private->RunningBits > LZ_BITS) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + + while (Private->CrntShiftState < Private->RunningBits) { + /* Needs to get more bytes from input stream for next code: */ + if (DGifBufferedInput(GifFile, Private->Buf, &NextByte) == + GIF_ERROR) { + return GIF_ERROR; + } + Private->CrntShiftDWord |= ((unsigned long)NextByte) + << Private->CrntShiftState; + Private->CrntShiftState += 8; + } + *Code = Private->CrntShiftDWord & CodeMasks[Private->RunningBits]; + + Private->CrntShiftDWord >>= Private->RunningBits; + Private->CrntShiftState -= Private->RunningBits; + + /* If code cannot fit into RunningBits bits, must raise its size. Note + * however that codes above 4095 are used for special signaling. + * If we're using LZ_BITS bits already and we're at the max code, just + * keep using the table as it is, don't increment Private->RunningCode. 
+ */ + if (Private->RunningCode < LZ_MAX_CODE + 2 && + ++Private->RunningCode > Private->MaxCode1 && + Private->RunningBits < LZ_BITS) { + Private->MaxCode1 <<= 1; + Private->RunningBits++; + } + return GIF_OK; +} + +/****************************************************************************** + This routines read one GIF data block at a time and buffers it internally + so that the decompression routine could access it. + The routine returns the next byte from its internal buffer (or read next + block in if buffer empty) and returns GIF_OK if succesful. +******************************************************************************/ +static int DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, + GifByteType *NextByte) { + if (Buf[0] == 0) { + /* Needs to read the next buffer - this one is empty: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + /* There shouldn't be any empty data blocks here as the LZW spec + * says the LZW termination code should come first. Therefore + * we shouldn't be inside this routine at that point. + */ + if (Buf[0] == 0) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + if (InternalRead(GifFile, &Buf[1], Buf[0]) != Buf[0]) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + *NextByte = Buf[1]; + Buf[1] = 2; /* We use now the second place as last char read! */ + Buf[0]--; + } else { + *NextByte = Buf[Buf[1]++]; + Buf[0]--; + } + + return GIF_OK; +} + +/****************************************************************************** + This routine is called in case of error during parsing image. We need to + decrease image counter and reallocate memory for saved images. Not decreasing + ImageCount may lead to null pointer dereference, because the last element in + SavedImages may point to the spoilt image and null pointer buffers. +*******************************************************************************/ +void DGifDecreaseImageCounter(GifFileType *GifFile) { + GifFile->ImageCount--; + if (GifFile->SavedImages[GifFile->ImageCount].RasterBits != NULL) { + free(GifFile->SavedImages[GifFile->ImageCount].RasterBits); + } + + // Realloc array according to the new image counter. + SavedImage *correct_saved_images = (SavedImage *)reallocarray( + GifFile->SavedImages, GifFile->ImageCount, sizeof(SavedImage)); + if (correct_saved_images != NULL) { + GifFile->SavedImages = correct_saved_images; + } +} + +/****************************************************************************** + This routine reads an entire GIF into core, hanging all its state info off + the GifFileType pointer. Call DGifOpenFileName() or DGifOpenFileHandle() + first to initialize I/O. Its inverse is EGifSpew(). 
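A sketch of the high-level path this comment describes: slurp the whole file, then walk each saved frame and resolve its palette indices through the local or global color map (illustrative only; no disposal or transparency handling):

#include <stdio.h>
#include "gif_lib.h"

static void print_frame_summaries(GifFileType *gif) {
    if (DGifSlurp(gif) == GIF_ERROR) {
        return;
    }
    for (int i = 0; i < gif->ImageCount; i++) {
        SavedImage *sp = &gif->SavedImages[i];
        ColorMapObject *cmap =
            sp->ImageDesc.ColorMap ? sp->ImageDesc.ColorMap : gif->SColorMap;
        if (cmap == NULL || sp->RasterBits == NULL) {
            continue;
        }
        GifColorType c = cmap->Colors[sp->RasterBits[0]];
        printf("frame %d: %dx%d, first pixel rgb(%d, %d, %d)\n", i,
               sp->ImageDesc.Width, sp->ImageDesc.Height, c.Red, c.Green,
               c.Blue);
    }
}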
+*******************************************************************************/ +int DGifSlurp(GifFileType *GifFile) { + size_t ImageSize; + GifRecordType RecordType; + SavedImage *sp; + GifByteType *ExtData; + int ExtFunction; + + GifFile->ExtensionBlocks = NULL; + GifFile->ExtensionBlockCount = 0; + + do { + if (DGifGetRecordType(GifFile, &RecordType) == GIF_ERROR) { + return (GIF_ERROR); + } + + switch (RecordType) { + case IMAGE_DESC_RECORD_TYPE: + if (DGifGetImageDesc(GifFile) == GIF_ERROR) { + return (GIF_ERROR); + } + + sp = &GifFile->SavedImages[GifFile->ImageCount - 1]; + /* Allocate memory for the image */ + if (sp->ImageDesc.Width <= 0 || + sp->ImageDesc.Height <= 0 || + sp->ImageDesc.Width > + (INT_MAX / sp->ImageDesc.Height)) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + ImageSize = sp->ImageDesc.Width * sp->ImageDesc.Height; + + if (ImageSize > (SIZE_MAX / sizeof(GifPixelType))) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + sp->RasterBits = (unsigned char *)reallocarray( + NULL, ImageSize, sizeof(GifPixelType)); + + if (sp->RasterBits == NULL) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + + if (sp->ImageDesc.Interlace) { + int i, j; + /* + * The way an interlaced image should be read - + * offsets and jumps... + */ + static const int InterlacedOffset[] = {0, 4, 2, + 1}; + static const int InterlacedJumps[] = {8, 8, 4, + 2}; + /* Need to perform 4 passes on the image */ + for (i = 0; i < 4; i++) { + for (j = InterlacedOffset[i]; + j < sp->ImageDesc.Height; + j += InterlacedJumps[i]) { + if (DGifGetLine( + GifFile, + sp->RasterBits + + j * sp->ImageDesc + .Width, + sp->ImageDesc.Width) == + GIF_ERROR) { + DGifDecreaseImageCounter( + GifFile); + return GIF_ERROR; + } + } + } + } else { + if (DGifGetLine(GifFile, sp->RasterBits, + ImageSize) == GIF_ERROR) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + } + + if (GifFile->ExtensionBlocks) { + sp->ExtensionBlocks = GifFile->ExtensionBlocks; + sp->ExtensionBlockCount = + GifFile->ExtensionBlockCount; + + GifFile->ExtensionBlocks = NULL; + GifFile->ExtensionBlockCount = 0; + } + break; + + case EXTENSION_RECORD_TYPE: + if (DGifGetExtension(GifFile, &ExtFunction, &ExtData) == + GIF_ERROR) { + return (GIF_ERROR); + } + /* Create an extension block with our data */ + if (ExtData != NULL) { + if (GifAddExtensionBlock( + &GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks, ExtFunction, + ExtData[0], &ExtData[1]) == GIF_ERROR) { + return (GIF_ERROR); + } + } + for (;;) { + if (DGifGetExtensionNext(GifFile, &ExtData) == + GIF_ERROR) { + return (GIF_ERROR); + } + if (ExtData == NULL) { + break; + } + /* Continue the extension block */ + if (GifAddExtensionBlock( + &GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks, + CONTINUE_EXT_FUNC_CODE, ExtData[0], + &ExtData[1]) == GIF_ERROR) { + return (GIF_ERROR); + } + } + break; + + case TERMINATE_RECORD_TYPE: + break; + + default: /* Should be trapped by DGifGetRecordType */ + break; + } + } while (RecordType != TERMINATE_RECORD_TYPE); + + /* Sanity check for corrupted file */ + if (GifFile->ImageCount == 0) { + GifFile->Error = D_GIF_ERR_NO_IMAG_DSCR; + return (GIF_ERROR); + } + + return (GIF_OK); +} + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_hash.c b/torchvision/csrc/io/image/cpu/giflib/gif_hash.c new file mode 100644 index 00000000000..42efbe8de68 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_hash.c @@ -0,0 +1,129 @@ +// @nolint (improperly imported third-party code) 
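The interlace handling in DGifSlurp above fills rows in four passes, offsets 0, 4, 2, 1 with strides 8, 8, 4, 2. A standalone illustration of the resulting storage order for a 10-row frame (it prints 0 8 4 2 6 1 3 5 7 9):

#include <stdio.h>

static void show_interlace_order(void) {
    static const int offset[] = {0, 4, 2, 1};
    static const int jump[] = {8, 8, 4, 2};
    const int height = 10;
    for (int pass = 0; pass < 4; pass++) {
        for (int row = offset[pass]; row < height; row += jump[pass]) {
            printf("%d ", row);
        }
    }
    printf("\n");
}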
+/***************************************************************************** + +gif_hash.c -- module to support the following operations: + +1. InitHashTable - initialize hash table. +2. ClearHashTable - clear the hash table to an empty state. +2. InsertHashTable - insert one item into data structure. +3. ExistsHashTable - test if item exists in data structure. + +This module is used to hash the GIF codes during encoding. + +*****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-File-Copyright-Txt: (C) Copyright 1989 Gershon Elber + +#include +#include +#include +#include +#include + +#include "gif_hash.h" +#include "gif_lib.h" +#include "gif_lib_private.h" + +/* #define DEBUG_HIT_RATE Debug number of misses per hash Insert/Exists. */ + +#ifdef DEBUG_HIT_RATE +static long NumberOfTests = 0, NumberOfMisses = 0; +#endif /* DEBUG_HIT_RATE */ + +static int KeyItem(uint32_t Item); + +/****************************************************************************** + Initialize HashTable - allocate the memory needed and clear it. * +******************************************************************************/ +GifHashTableType *_InitHashTable(void) { + GifHashTableType *HashTable; + + if ((HashTable = (GifHashTableType *)malloc( + sizeof(GifHashTableType))) == NULL) { + return NULL; + } + + _ClearHashTable(HashTable); + + return HashTable; +} + +/****************************************************************************** + Routine to clear the HashTable to an empty state. * + This part is a little machine depended. Use the commented part otherwise. * +******************************************************************************/ +void _ClearHashTable(GifHashTableType *HashTable) { + memset(HashTable->HTable, 0xFF, HT_SIZE * sizeof(uint32_t)); +} + +/****************************************************************************** + Routine to insert a new Item into the HashTable. The data is assumed to be * + new one. * +******************************************************************************/ +void _InsertHashTable(GifHashTableType *HashTable, uint32_t Key, int Code) { + int HKey = KeyItem(Key); + uint32_t *HTable = HashTable->HTable; + +#ifdef DEBUG_HIT_RATE + NumberOfTests++; + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + + while (HT_GET_KEY(HTable[HKey]) != 0xFFFFFL) { +#ifdef DEBUG_HIT_RATE + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + HKey = (HKey + 1) & HT_KEY_MASK; + } + HTable[HKey] = HT_PUT_KEY(Key) | HT_PUT_CODE(Code); +} + +/****************************************************************************** + Routine to test if given Key exists in HashTable and if so returns its code * + Returns the Code if key was found, -1 if not. * +******************************************************************************/ +int _ExistsHashTable(GifHashTableType *HashTable, uint32_t Key) { + int HKey = KeyItem(Key); + uint32_t *HTable = HashTable->HTable, HTKey; + +#ifdef DEBUG_HIT_RATE + NumberOfTests++; + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + + while ((HTKey = HT_GET_KEY(HTable[HKey])) != 0xFFFFFL) { +#ifdef DEBUG_HIT_RATE + NumberOfMisses++; +#endif /* DEBUG_HIT_RATE */ + if (Key == HTKey) { + return HT_GET_CODE(HTable[HKey]); + } + HKey = (HKey + 1) & HT_KEY_MASK; + } + + return -1; +} + +/****************************************************************************** + Routine to generate an HKey for the hashtable out of the given unique key. 
* + The given Key is assumed to be 20 bits as follows: lower 8 bits are the * + new postfix character, while the upper 12 bits are the prefix code. * + Because the average hit ratio is only 2 (2 hash references per entry), * + evaluating more complex keys (such as twin prime keys) is not worth it! * +******************************************************************************/ +static int KeyItem(uint32_t Item) { + return ((Item >> 12) ^ Item) & HT_KEY_MASK; +} + +#ifdef DEBUG_HIT_RATE +/****************************************************************************** + Debugging routine to print the hit ratio - number of times the hash table * + was tested per operation. This routine was used to test the KeyItem routine * +******************************************************************************/ +void HashTablePrintHitRatio(void) { + printf("Hash Table Hit Ratio is %ld/%ld = %ld%%.\n", NumberOfMisses, + NumberOfTests, NumberOfMisses * 100 / NumberOfTests); +} +#endif /* DEBUG_HIT_RATE */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_hash.h b/torchvision/csrc/io/image/cpu/giflib/gif_hash.h new file mode 100644 index 00000000000..3066fb14592 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_hash.h @@ -0,0 +1,43 @@ +// @nolint (improperly imported third-party code) +/****************************************************************************** + +gif_hash.h - magic constants and declarations for GIF LZW + +******************************************************************************/ +// SPDX-License-Identifier: MIT + +#ifndef _GIF_HASH_H_ +#define _GIF_HASH_H_ + +#ifndef _WIN32 +#include <unistd.h> +#endif /* _WIN32 */ +#include <stdint.h> + +#define HT_SIZE 8192 /* 12bits = 4096 or twice as big! */ +#define HT_KEY_MASK 0x1FFF /* 13bits keys */ +#define HT_KEY_NUM_BITS 13 /* 13bits keys */ +#define HT_MAX_KEY 8191 /* 13bits - 1, maximal code possible */ +#define HT_MAX_CODE 4095 /* Biggest code possible in 12 bits. */ + +/* The 32 bits of the long are divided into two parts for the key & code: */ +/* 1. The code is 12 bits as our compression algorithm is limited to 12bits */ +/* 2. The key is 12 bits Prefix code + 8 bit new char or 20 bits. */ +/* The key is the upper 20 bits. The code is the lower 12. 
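The encoder in egif_lib.c (not part of this hunk) builds that 20-bit key by packing the 12-bit prefix code above the 8-bit character that follows it; the packing below is therefore an assumption here, while the xor-fold mirrors KeyItem above:

#include <stdint.h>

/* Illustrative only: pack (prefix code, next byte) and fold to a 13-bit slot. */
static int example_slot(uint32_t prefix_code, uint8_t next_byte) {
    uint32_t key = (prefix_code << 8) | next_byte; /* 12 + 8 = 20 bits (assumed) */
    return (int)(((key >> 12) ^ key) & 0x1FFF);    /* 0x1FFF == HT_KEY_MASK */
}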
*/ +#define HT_GET_KEY(l) (l >> 12) +#define HT_GET_CODE(l) (l & 0x0FFF) +#define HT_PUT_KEY(l) (l << 12) +#define HT_PUT_CODE(l) (l & 0x0FFF) + +typedef struct GifHashTableType { + uint32_t HTable[HT_SIZE]; +} GifHashTableType; + +GifHashTableType *_InitHashTable(void); +void _ClearHashTable(GifHashTableType *HashTable); +void _InsertHashTable(GifHashTableType *HashTable, uint32_t Key, int Code); +int _ExistsHashTable(GifHashTableType *HashTable, uint32_t Key); + +#endif /* _GIF_HASH_H_ */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_lib.h b/torchvision/csrc/io/image/cpu/giflib/gif_lib.h new file mode 100644 index 00000000000..7bed0430450 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_lib.h @@ -0,0 +1,292 @@ +// @nolint (improperly imported third-party code) +/****************************************************************************** + +gif_lib.h - service library for decoding and encoding GIF images + +SPDX-License-Identifier: MIT + +*****************************************************************************/ + +#ifndef _GIF_LIB_H_ +#define _GIF_LIB_H_ 1 + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#define GIFLIB_MAJOR 5 +#define GIFLIB_MINOR 2 +#define GIFLIB_RELEASE 2 + +#define GIF_ERROR 0 +#define GIF_OK 1 + +#include +#include + +#define GIF_STAMP "GIFVER" /* First chars in file - GIF stamp. */ +#define GIF_STAMP_LEN sizeof(GIF_STAMP) - 1 +#define GIF_VERSION_POS 3 /* Version first character in stamp. */ +#define GIF87_STAMP "GIF87a" /* First chars in file - GIF stamp. */ +#define GIF89_STAMP "GIF89a" /* First chars in file - GIF stamp. */ + +typedef unsigned char GifPixelType; +typedef unsigned char *GifRowType; +typedef unsigned char GifByteType; +typedef unsigned int GifPrefixType; +typedef int GifWord; + +typedef struct GifColorType { + GifByteType Red, Green, Blue; +} GifColorType; + +typedef struct ColorMapObject { + int ColorCount; + int BitsPerPixel; + bool SortFlag; + GifColorType *Colors; /* on malloc(3) heap */ +} ColorMapObject; + +typedef struct GifImageDesc { + GifWord Left, Top, Width, Height; /* Current image dimensions. */ + bool Interlace; /* Sequential/Interlaced lines. */ + ColorMapObject *ColorMap; /* The local color map */ +} GifImageDesc; + +typedef struct ExtensionBlock { + int ByteCount; + GifByteType *Bytes; /* on malloc(3) heap */ + int Function; /* The block function code */ +#define CONTINUE_EXT_FUNC_CODE 0x00 /* continuation subblock */ +#define COMMENT_EXT_FUNC_CODE 0xfe /* comment */ +#define GRAPHICS_EXT_FUNC_CODE 0xf9 /* graphics control (GIF89) */ +#define PLAINTEXT_EXT_FUNC_CODE 0x01 /* plaintext */ +#define APPLICATION_EXT_FUNC_CODE 0xff /* application block (GIF89) */ +} ExtensionBlock; + +typedef struct SavedImage { + GifImageDesc ImageDesc; + GifByteType *RasterBits; /* on malloc(3) heap */ + int ExtensionBlockCount; /* Count of extensions before image */ + ExtensionBlock *ExtensionBlocks; /* Extensions before image */ +} SavedImage; + +typedef struct GifFileType { + GifWord SWidth, SHeight; /* Size of virtual canvas */ + GifWord SColorResolution; /* How many colors can we generate? */ + GifWord SBackGroundColor; /* Background color for virtual canvas */ + GifByteType AspectByte; /* Used to compute pixel aspect ratio */ + ColorMapObject *SColorMap; /* Global colormap, NULL if nonexistent. 
*/ + int ImageCount; /* Number of current image (both APIs) */ + GifImageDesc Image; /* Current image (low-level API) */ + SavedImage *SavedImages; /* Image sequence (high-level API) */ + int ExtensionBlockCount; /* Count extensions past last image */ + ExtensionBlock *ExtensionBlocks; /* Extensions past last image */ + int Error; /* Last error condition reported */ + void *UserData; /* hook to attach user data (TVT) */ + void *Private; /* Don't mess with this! */ +} GifFileType; + +#define GIF_ASPECT_RATIO(n) ((n) + 15.0 / 64.0) + +typedef enum { + UNDEFINED_RECORD_TYPE, + SCREEN_DESC_RECORD_TYPE, + IMAGE_DESC_RECORD_TYPE, /* Begin with ',' */ + EXTENSION_RECORD_TYPE, /* Begin with '!' */ + TERMINATE_RECORD_TYPE /* Begin with ';' */ +} GifRecordType; + +/* func type to read gif data from arbitrary sources (TVT) */ +typedef int (*InputFunc)(GifFileType *, GifByteType *, int); + +/* func type to write gif data to arbitrary targets. + * Returns count of bytes written. (MRB) + */ +typedef int (*OutputFunc)(GifFileType *, const GifByteType *, int); + +/****************************************************************************** + GIF89 structures +******************************************************************************/ + +typedef struct GraphicsControlBlock { + int DisposalMode; +#define DISPOSAL_UNSPECIFIED 0 /* No disposal specified. */ +#define DISPOSE_DO_NOT 1 /* Leave image in place */ +#define DISPOSE_BACKGROUND 2 /* Set area too background color */ +#define DISPOSE_PREVIOUS 3 /* Restore to previous content */ + bool UserInputFlag; /* User confirmation required before disposal */ + int DelayTime; /* pre-display delay in 0.01sec units */ + int TransparentColor; /* Palette index for transparency, -1 if none */ +#define NO_TRANSPARENT_COLOR -1 +} GraphicsControlBlock; + +/****************************************************************************** + GIF encoding routines +******************************************************************************/ + +/* Main entry points */ +GifFileType *EGifOpenFileName(const char *GifFileName, + const bool GifTestExistence, int *Error); +GifFileType *EGifOpenFileHandle(const int GifFileHandle, int *Error); +GifFileType *EGifOpen(void *userPtr, OutputFunc writeFunc, int *Error); +int EGifSpew(GifFileType *GifFile); +const char *EGifGetGifVersion(GifFileType *GifFile); /* new in 5.x */ +int EGifCloseFile(GifFileType *GifFile, int *ErrorCode); + +#define E_GIF_SUCCEEDED 0 +#define E_GIF_ERR_OPEN_FAILED 1 /* And EGif possible errors. */ +#define E_GIF_ERR_WRITE_FAILED 2 +#define E_GIF_ERR_HAS_SCRN_DSCR 3 +#define E_GIF_ERR_HAS_IMAG_DSCR 4 +#define E_GIF_ERR_NO_COLOR_MAP 5 +#define E_GIF_ERR_DATA_TOO_BIG 6 +#define E_GIF_ERR_NOT_ENOUGH_MEM 7 +#define E_GIF_ERR_DISK_IS_FULL 8 +#define E_GIF_ERR_CLOSE_FAILED 9 +#define E_GIF_ERR_NOT_WRITEABLE 10 + +/* These are legacy. 
You probably do not want to call them directly */ +int EGifPutScreenDesc(GifFileType *GifFile, const int GifWidth, + const int GifHeight, const int GifColorRes, + const int GifBackGround, + const ColorMapObject *GifColorMap); +int EGifPutImageDesc(GifFileType *GifFile, const int GifLeft, const int GifTop, + const int GifWidth, const int GifHeight, + const bool GifInterlace, + const ColorMapObject *GifColorMap); +void EGifSetGifVersion(GifFileType *GifFile, const bool gif89); +int EGifPutLine(GifFileType *GifFile, GifPixelType *GifLine, int GifLineLen); +int EGifPutPixel(GifFileType *GifFile, const GifPixelType GifPixel); +int EGifPutComment(GifFileType *GifFile, const char *GifComment); +int EGifPutExtensionLeader(GifFileType *GifFile, const int GifExtCode); +int EGifPutExtensionBlock(GifFileType *GifFile, const int GifExtLen, + const void *GifExtension); +int EGifPutExtensionTrailer(GifFileType *GifFile); +int EGifPutExtension(GifFileType *GifFile, const int GifExtCode, + const int GifExtLen, const void *GifExtension); +int EGifPutCode(GifFileType *GifFile, int GifCodeSize, + const GifByteType *GifCodeBlock); +int EGifPutCodeNext(GifFileType *GifFile, const GifByteType *GifCodeBlock); + +/****************************************************************************** + GIF decoding routines +******************************************************************************/ + +/* Main entry points */ +GifFileType *DGifOpenFileName(const char *GifFileName, int *Error); +GifFileType *DGifOpenFileHandle(int GifFileHandle, int *Error); +int DGifSlurp(GifFileType *GifFile); +GifFileType *DGifOpen(void *userPtr, InputFunc readFunc, + int *Error); /* new one (TVT) */ +int DGifCloseFile(GifFileType *GifFile, int *ErrorCode); + +#define D_GIF_SUCCEEDED 0 +#define D_GIF_ERR_OPEN_FAILED 101 /* And DGif possible errors. */ +#define D_GIF_ERR_READ_FAILED 102 +#define D_GIF_ERR_NOT_GIF_FILE 103 +#define D_GIF_ERR_NO_SCRN_DSCR 104 +#define D_GIF_ERR_NO_IMAG_DSCR 105 +#define D_GIF_ERR_NO_COLOR_MAP 106 +#define D_GIF_ERR_WRONG_RECORD 107 +#define D_GIF_ERR_DATA_TOO_BIG 108 +#define D_GIF_ERR_NOT_ENOUGH_MEM 109 +#define D_GIF_ERR_CLOSE_FAILED 110 +#define D_GIF_ERR_NOT_READABLE 111 +#define D_GIF_ERR_IMAGE_DEFECT 112 +#define D_GIF_ERR_EOF_TOO_SOON 113 + +/* These are legacy. You probably do not want to call them directly */ +int DGifGetScreenDesc(GifFileType *GifFile); +int DGifGetRecordType(GifFileType *GifFile, GifRecordType *GifType); +int DGifGetImageHeader(GifFileType *GifFile); +int DGifGetImageDesc(GifFileType *GifFile); +int DGifGetLine(GifFileType *GifFile, GifPixelType *GifLine, int GifLineLen); +int DGifGetPixel(GifFileType *GifFile, GifPixelType GifPixel); +int DGifGetExtension(GifFileType *GifFile, int *GifExtCode, + GifByteType **GifExtension); +int DGifGetExtensionNext(GifFileType *GifFile, GifByteType **GifExtension); +int DGifGetCode(GifFileType *GifFile, int *GifCodeSize, + GifByteType **GifCodeBlock); +int DGifGetCodeNext(GifFileType *GifFile, GifByteType **GifCodeBlock); +int DGifGetLZCodes(GifFileType *GifFile, int *GifCode); +const char *DGifGetGifVersion(GifFileType *GifFile); + +/****************************************************************************** + Error handling and reporting. +******************************************************************************/ +extern const char *GifErrorString(int ErrorCode); /* new in 2012 - ESR */ + +/***************************************************************************** + it g in core. 
+******************************************************************************/ + +/****************************************************************************** + Color map handling from gif_alloc.c +******************************************************************************/ + +extern ColorMapObject *GifMakeMapObject(int ColorCount, + const GifColorType *ColorMap); +extern void GifFreeMapObject(ColorMapObject *Object); +extern ColorMapObject *GifUnionColorMap(const ColorMapObject *ColorIn1, + const ColorMapObject *ColorIn2, + GifPixelType ColorTransIn2[]); +extern int GifBitSize(int n); + +/****************************************************************************** + Support for the in-core structures allocation (slurp mode). +******************************************************************************/ + +extern void GifApplyTranslation(SavedImage *Image, + const GifPixelType Translation[]); +extern int GifAddExtensionBlock(int *ExtensionBlock_Count, + ExtensionBlock **ExtensionBlocks, int Function, + unsigned int Len, unsigned char ExtData[]); +extern void GifFreeExtensions(int *ExtensionBlock_Count, + ExtensionBlock **ExtensionBlocks); +extern SavedImage *GifMakeSavedImage(GifFileType *GifFile, + const SavedImage *CopyFrom); +extern void GifFreeSavedImages(GifFileType *GifFile); + +/****************************************************************************** + 5.x functions for GIF89 graphics control blocks +******************************************************************************/ + +int DGifExtensionToGCB(const size_t GifExtensionLength, + const GifByteType *GifExtension, + GraphicsControlBlock *GCB); +size_t EGifGCBToExtension(const GraphicsControlBlock *GCB, + GifByteType *GifExtension); + +int DGifSavedExtensionToGCB(GifFileType *GifFile, int ImageIndex, + GraphicsControlBlock *GCB); +int EGifGCBToSavedExtension(const GraphicsControlBlock *GCB, + GifFileType *GifFile, int ImageIndex); + +/****************************************************************************** + The library's internal utility font +******************************************************************************/ + +#define GIF_FONT_WIDTH 8 +#define GIF_FONT_HEIGHT 8 +extern const unsigned char GifAsciiTable8x8[][GIF_FONT_WIDTH]; + +extern void GifDrawText8x8(SavedImage *Image, const int x, const int y, + const char *legend, const int color); + +extern void GifDrawBox(SavedImage *Image, const int x, const int y, const int w, + const int d, const int color); + +extern void GifDrawRectangle(SavedImage *Image, const int x, const int y, + const int w, const int d, const int color); + +extern void GifDrawBoxedText8x8(SavedImage *Image, const int x, const int y, + const char *legend, const int border, + const int bg, const int fg); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* _GIF_LIB_H */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h b/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h new file mode 100644 index 00000000000..04987150321 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h @@ -0,0 +1,73 @@ +// @nolint (improperly imported third-party code) +/**************************************************************************** + +gif_lib_private.h - internal giflib routines and structures + +SPDX-License-Identifier: MIT + +****************************************************************************/ + +#ifndef _GIF_LIB_PRIVATE_H +#define _GIF_LIB_PRIVATE_H + +#include "gif_hash.h" +#include "gif_lib.h" + +#ifndef 
SIZE_MAX +#define SIZE_MAX UINTPTR_MAX +#endif + +#define EXTENSION_INTRODUCER 0x21 +#define DESCRIPTOR_INTRODUCER 0x2c +#define TERMINATOR_INTRODUCER 0x3b + +#define LZ_MAX_CODE 4095 /* Biggest code possible in 12 bits. */ +#define LZ_BITS 12 + +#define FLUSH_OUTPUT 4096 /* Impossible code, to signal flush. */ +#define FIRST_CODE 4097 /* Impossible code, to signal first. */ +#define NO_SUCH_CODE 4098 /* Impossible code, to signal empty. */ + +#define FILE_STATE_WRITE 0x01 +#define FILE_STATE_SCREEN 0x02 +#define FILE_STATE_IMAGE 0x04 +#define FILE_STATE_READ 0x08 + +#define IS_READABLE(Private) (Private->FileState & FILE_STATE_READ) +#define IS_WRITEABLE(Private) (Private->FileState & FILE_STATE_WRITE) + +typedef struct GifFilePrivateType { + GifWord FileState, FileHandle, /* Where all this data goes to! */ + BitsPerPixel, /* Bits per pixel (Codes uses at least this + 1). */ + ClearCode, /* The CLEAR LZ code. */ + EOFCode, /* The EOF LZ code. */ + RunningCode, /* The next code algorithm can generate. */ + RunningBits, /* The number of bits required to represent + RunningCode. */ + MaxCode1, /* 1 bigger than max. possible code, in RunningBits bits. + */ + LastCode, /* The code before the current code. */ + CrntCode, /* Current algorithm code. */ + StackPtr, /* For character stack (see below). */ + CrntShiftState; /* Number of bits in CrntShiftDWord. */ + unsigned long CrntShiftDWord; /* For bytes decomposition into codes. */ + unsigned long PixelCount; /* Number of pixels in image. */ + FILE *File; /* File as stream. */ + InputFunc Read; /* function to read gif input (TVT) */ + OutputFunc Write; /* function to write gif output (MRB) */ + GifByteType Buf[256]; /* Compressed input is buffered here. */ + GifByteType Stack[LZ_MAX_CODE]; /* Decoded pixels are stacked here. */ + GifByteType Suffix[LZ_MAX_CODE + 1]; /* So we can trace the codes. */ + GifPrefixType Prefix[LZ_MAX_CODE + 1]; + GifHashTableType *HashTable; + bool gif89; +} GifFilePrivateType; + +#ifndef HAVE_REALLOCARRAY +extern void *openbsd_reallocarray(void *optr, size_t nmemb, size_t size); +#define reallocarray openbsd_reallocarray +#endif + +#endif /* _GIF_LIB_PRIVATE_H */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gifalloc.c b/torchvision/csrc/io/image/cpu/giflib/gifalloc.c new file mode 100644 index 00000000000..65679d22804 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gifalloc.c @@ -0,0 +1,426 @@ +// @nolint (improperly imported third-party code) +/***************************************************************************** + + GIF construction tools + +****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright (C) Eric S. Raymond + +#include +#include +#include + +#include "gif_lib.h" +#include "gif_lib_private.h" + +#define MAX(x, y) (((x) > (y)) ? 
(x) : (y)) + +/****************************************************************************** + Miscellaneous utility functions +******************************************************************************/ + +/* return smallest bitfield size n will fit in */ +int GifBitSize(int n) { + int i; + + for (i = 1; i <= 8; i++) { + if ((1 << i) >= n) { + break; + } + } + return (i); +} + +/****************************************************************************** + Color map object functions +******************************************************************************/ + +/* + * Allocate a color map of given size; initialize with contents of + * ColorMap if that pointer is non-NULL. + */ +ColorMapObject *GifMakeMapObject(int ColorCount, const GifColorType *ColorMap) { + ColorMapObject *Object; + + /*** FIXME: Our ColorCount has to be a power of two. Is it necessary to + * make the user know that or should we automatically round up instead? + */ + if (ColorCount != (1 << GifBitSize(ColorCount))) { + return ((ColorMapObject *)NULL); + } + + Object = (ColorMapObject *)malloc(sizeof(ColorMapObject)); + if (Object == (ColorMapObject *)NULL) { + return ((ColorMapObject *)NULL); + } + + Object->Colors = + (GifColorType *)calloc(ColorCount, sizeof(GifColorType)); + if (Object->Colors == (GifColorType *)NULL) { + free(Object); + return ((ColorMapObject *)NULL); + } + + Object->ColorCount = ColorCount; + Object->BitsPerPixel = GifBitSize(ColorCount); + Object->SortFlag = false; + + if (ColorMap != NULL) { + memcpy((char *)Object->Colors, (char *)ColorMap, + ColorCount * sizeof(GifColorType)); + } + + return (Object); +} + +/******************************************************************************* + Free a color map object +*******************************************************************************/ +void GifFreeMapObject(ColorMapObject *Object) { + if (Object != NULL) { + (void)free(Object->Colors); + (void)free(Object); + } +} + +#ifdef DEBUG +void DumpColorMap(ColorMapObject *Object, FILE *fp) { + if (Object != NULL) { + int i, j, Len = Object->ColorCount; + + for (i = 0; i < Len; i += 4) { + for (j = 0; j < 4 && j < Len; j++) { + (void)fprintf(fp, "%3d: %02x %02x %02x ", + i + j, Object->Colors[i + j].Red, + Object->Colors[i + j].Green, + Object->Colors[i + j].Blue); + } + (void)fprintf(fp, "\n"); + } + } +} +#endif /* DEBUG */ + +/******************************************************************************* + Compute the union of two given color maps and return it. If result can't + fit into 256 colors, NULL is returned, the allocated union otherwise. + ColorIn1 is copied as is to ColorUnion, while colors from ColorIn2 are + copied iff they didn't exist before. ColorTransIn2 maps the old + ColorIn2 into the ColorUnion color map table./ +*******************************************************************************/ +ColorMapObject *GifUnionColorMap(const ColorMapObject *ColorIn1, + const ColorMapObject *ColorIn2, + GifPixelType ColorTransIn2[]) { + int i, j, CrntSlot, RoundUpTo, NewGifBitSize; + ColorMapObject *ColorUnion; + + /* + * We don't worry about duplicates within either color map; if + * the caller wants to resolve those, he can perform unions + * with an empty color map. + */ + + /* Allocate table which will hold the result for sure. */ + ColorUnion = GifMakeMapObject( + MAX(ColorIn1->ColorCount, ColorIn2->ColorCount) * 2, NULL); + + if (ColorUnion == NULL) { + return (NULL); + } + + /* + * Copy ColorIn1 to ColorUnion. 
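GifMakeMapObject() above rejects any color count that is not an exact power of two (the FIXME notes this), so callers are expected to round up themselves. A minimal caller-side sketch, assuming nothing beyond the functions declared in gif_lib.h; the helper name make_rounded_color_map is illustrative and not part of giflib:

#include <string.h>
#include "gif_lib.h"

/* Build a color map from an arbitrary number of colors by rounding the slot
 * count up to the next power of two, e.g. 5 -> 8, 200 -> 256. */
static ColorMapObject *make_rounded_color_map(int color_count,
                                              const GifColorType *colors) {
    int rounded = 1 << GifBitSize(color_count);

    /* Allocate zero-initialized slots, then copy only the real entries so we
     * never read past the caller's array. */
    ColorMapObject *map = GifMakeMapObject(rounded, NULL);
    if (map != NULL && colors != NULL) {
        memcpy(map->Colors, colors, color_count * sizeof(GifColorType));
    }
    return map; /* NULL on allocation failure */
}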
+ */ + for (i = 0; i < ColorIn1->ColorCount; i++) { + ColorUnion->Colors[i] = ColorIn1->Colors[i]; + } + CrntSlot = ColorIn1->ColorCount; + + /* + * Potentially obnoxious hack: + * + * Back CrntSlot down past all contiguous {0, 0, 0} slots at the end + * of table 1. This is very useful if your display is limited to + * 16 colors. + */ + while (ColorIn1->Colors[CrntSlot - 1].Red == 0 && + ColorIn1->Colors[CrntSlot - 1].Green == 0 && + ColorIn1->Colors[CrntSlot - 1].Blue == 0) { + CrntSlot--; + } + + /* Copy ColorIn2 to ColorUnion (use old colors if they exist): */ + for (i = 0; i < ColorIn2->ColorCount && CrntSlot <= 256; i++) { + /* Let's see if this color already exists: */ + for (j = 0; j < ColorIn1->ColorCount; j++) { + if (memcmp(&ColorIn1->Colors[j], &ColorIn2->Colors[i], + sizeof(GifColorType)) == 0) { + break; + } + } + + if (j < ColorIn1->ColorCount) { + ColorTransIn2[i] = j; /* color exists in Color1 */ + } else { + /* Color is new - copy it to a new slot: */ + ColorUnion->Colors[CrntSlot] = ColorIn2->Colors[i]; + ColorTransIn2[i] = CrntSlot++; + } + } + + if (CrntSlot > 256) { + GifFreeMapObject(ColorUnion); + return ((ColorMapObject *)NULL); + } + + NewGifBitSize = GifBitSize(CrntSlot); + RoundUpTo = (1 << NewGifBitSize); + + if (RoundUpTo != ColorUnion->ColorCount) { + GifColorType *Map = ColorUnion->Colors; + + /* + * Zero out slots up to next power of 2. + * We know these slots exist because of the way ColorUnion's + * start dimension was computed. + */ + for (j = CrntSlot; j < RoundUpTo; j++) { + Map[j].Red = Map[j].Green = Map[j].Blue = 0; + } + + /* perhaps we can shrink the map? */ + if (RoundUpTo < ColorUnion->ColorCount) { + GifColorType *new_map = (GifColorType *)reallocarray( + Map, RoundUpTo, sizeof(GifColorType)); + if (new_map == NULL) { + GifFreeMapObject(ColorUnion); + return ((ColorMapObject *)NULL); + } + ColorUnion->Colors = new_map; + } + } + + ColorUnion->ColorCount = RoundUpTo; + ColorUnion->BitsPerPixel = NewGifBitSize; + + return (ColorUnion); +} + +/******************************************************************************* + Apply a given color translation to the raster bits of an image +*******************************************************************************/ +void GifApplyTranslation(SavedImage *Image, const GifPixelType Translation[]) { + int i; + int RasterSize = + Image->ImageDesc.Height * Image->ImageDesc.Width; + + for (i = 0; i < RasterSize; i++) { + Image->RasterBits[i] = Translation[Image->RasterBits[i]]; + } +} + +/****************************************************************************** + Extension record functions +******************************************************************************/ +int GifAddExtensionBlock(int *ExtensionBlockCount, + ExtensionBlock **ExtensionBlocks, int Function, + unsigned int Len, unsigned char ExtData[]) { + ExtensionBlock *ep; + + if (*ExtensionBlocks == NULL) { + *ExtensionBlocks = + (ExtensionBlock *)malloc(sizeof(ExtensionBlock)); + } else { + ExtensionBlock *ep_new = (ExtensionBlock *)reallocarray( + *ExtensionBlocks, (*ExtensionBlockCount + 1), + sizeof(ExtensionBlock)); + if (ep_new == NULL) { + return (GIF_ERROR); + } + *ExtensionBlocks = ep_new; + } + + if (*ExtensionBlocks == NULL) { + return (GIF_ERROR); + } + + ep = &(*ExtensionBlocks)[(*ExtensionBlockCount)++]; + + ep->Function = Function; + ep->ByteCount = Len; + ep->Bytes = (GifByteType *)malloc(ep->ByteCount); + if (ep->Bytes == NULL) { + return (GIF_ERROR); + } + + if (ExtData != NULL) { + memcpy(ep->Bytes, ExtData, Len); + 
} + + return (GIF_OK); +} + +void GifFreeExtensions(int *ExtensionBlockCount, + ExtensionBlock **ExtensionBlocks) { + ExtensionBlock *ep; + + if (*ExtensionBlocks == NULL) { + return; + } + + for (ep = *ExtensionBlocks; + ep < (*ExtensionBlocks + *ExtensionBlockCount); ep++) { + (void)free((char *)ep->Bytes); + } + (void)free((char *)*ExtensionBlocks); + *ExtensionBlocks = NULL; + *ExtensionBlockCount = 0; +} + +/****************************************************************************** + Image block allocation functions +******************************************************************************/ + +/* Private Function: + * Frees the last image in the GifFile->SavedImages array + */ +void FreeLastSavedImage(GifFileType *GifFile) { + SavedImage *sp; + + if ((GifFile == NULL) || (GifFile->SavedImages == NULL)) { + return; + } + + /* Remove one SavedImage from the GifFile */ + GifFile->ImageCount--; + sp = &GifFile->SavedImages[GifFile->ImageCount]; + + /* Deallocate its Colormap */ + if (sp->ImageDesc.ColorMap != NULL) { + GifFreeMapObject(sp->ImageDesc.ColorMap); + sp->ImageDesc.ColorMap = NULL; + } + + /* Deallocate the image data */ + if (sp->RasterBits != NULL) { + free((char *)sp->RasterBits); + } + + /* Deallocate any extensions */ + GifFreeExtensions(&sp->ExtensionBlockCount, &sp->ExtensionBlocks); + + /*** FIXME: We could realloc the GifFile->SavedImages structure but is + * there a point to it? Saves some memory but we'd have to do it every + * time. If this is used in GifFreeSavedImages then it would be + * inefficient (The whole array is going to be deallocated.) If we just + * use it when we want to free the last Image it's convenient to do it + * here. + */ +} + +/* + * Append an image block to the SavedImages array + */ +SavedImage *GifMakeSavedImage(GifFileType *GifFile, + const SavedImage *CopyFrom) { + // cppcheck-suppress ctunullpointer + if (GifFile->SavedImages == NULL) { + GifFile->SavedImages = (SavedImage *)malloc(sizeof(SavedImage)); + } else { + SavedImage *newSavedImages = (SavedImage *)reallocarray( + GifFile->SavedImages, (GifFile->ImageCount + 1), + sizeof(SavedImage)); + if (newSavedImages == NULL) { + return ((SavedImage *)NULL); + } + GifFile->SavedImages = newSavedImages; + } + if (GifFile->SavedImages == NULL) { + return ((SavedImage *)NULL); + } else { + SavedImage *sp = &GifFile->SavedImages[GifFile->ImageCount++]; + + if (CopyFrom != NULL) { + memcpy((char *)sp, CopyFrom, sizeof(SavedImage)); + + /* + * Make our own allocated copies of the heap fields in + * the copied record. This guards against potential + * aliasing problems. 
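SavedImage owns three heap fields (ImageDesc.ColorMap, RasterBits and ExtensionBlocks), so a plain struct copy would leave two records aliasing the same buffers and GifFreeSavedImages() would later free them twice; the deep copies made below avoid that. A short usage sketch, with the helper name append_frame_copy being illustrative only:

#include "gif_lib.h"

/* Append a deep copy of an existing frame to a slurped GIF.  On failure
 * GifMakeSavedImage() has already rolled the partial record back via
 * FreeLastSavedImage(), so there is nothing further to clean up here. */
static int append_frame_copy(GifFileType *gif, const SavedImage *frame) {
    return (GifMakeSavedImage(gif, frame) != NULL) ? GIF_OK : GIF_ERROR;
}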
+ */ + + /* first, the local color map */ + if (CopyFrom->ImageDesc.ColorMap != NULL) { + sp->ImageDesc.ColorMap = GifMakeMapObject( + CopyFrom->ImageDesc.ColorMap->ColorCount, + CopyFrom->ImageDesc.ColorMap->Colors); + if (sp->ImageDesc.ColorMap == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + } + + /* next, the raster */ + sp->RasterBits = (unsigned char *)reallocarray( + NULL, + (CopyFrom->ImageDesc.Height * + CopyFrom->ImageDesc.Width), + sizeof(GifPixelType)); + if (sp->RasterBits == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + memcpy(sp->RasterBits, CopyFrom->RasterBits, + sizeof(GifPixelType) * + CopyFrom->ImageDesc.Height * + CopyFrom->ImageDesc.Width); + + /* finally, the extension blocks */ + if (CopyFrom->ExtensionBlocks != NULL) { + sp->ExtensionBlocks = + (ExtensionBlock *)reallocarray( + NULL, CopyFrom->ExtensionBlockCount, + sizeof(ExtensionBlock)); + if (sp->ExtensionBlocks == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + memcpy(sp->ExtensionBlocks, + CopyFrom->ExtensionBlocks, + sizeof(ExtensionBlock) * + CopyFrom->ExtensionBlockCount); + } + } else { + memset((char *)sp, '\0', sizeof(SavedImage)); + } + + return (sp); + } +} + +void GifFreeSavedImages(GifFileType *GifFile) { + SavedImage *sp; + + if ((GifFile == NULL) || (GifFile->SavedImages == NULL)) { + return; + } + for (sp = GifFile->SavedImages; + sp < GifFile->SavedImages + GifFile->ImageCount; sp++) { + if (sp->ImageDesc.ColorMap != NULL) { + GifFreeMapObject(sp->ImageDesc.ColorMap); + sp->ImageDesc.ColorMap = NULL; + } + + if (sp->RasterBits != NULL) { + free((char *)sp->RasterBits); + } + + GifFreeExtensions(&sp->ExtensionBlockCount, + &sp->ExtensionBlocks); + } + free((char *)GifFile->SavedImages); + GifFile->SavedImages = NULL; +} + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c b/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c new file mode 100644 index 00000000000..7d5f1e73a7d --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c @@ -0,0 +1,74 @@ +// @nolint (improperly imported third-party code) +/* + * SPDX-FileCopyrightText: Copyright (C) 2008 Otto Moerbeek + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifndef SIZE_MAX +#define SIZE_MAX UINTPTR_MAX +#endif + +/* + * This is sqrt(SIZE_MAX+1), as s1*s2 <= SIZE_MAX + * if both s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW + */ +#define MUL_NO_OVERFLOW ((size_t)1 << (sizeof(size_t) * 4)) + +void *openbsd_reallocarray(void *optr, size_t nmemb, size_t size) { + if ((nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) && + nmemb > 0 && SIZE_MAX / nmemb < size) { + errno = ENOMEM; + return NULL; + } + /* + * Head off variations in realloc behavior on different + * platforms (reported by MarkR ) + * + * The behaviour of reallocarray is implementation-defined if + * nmemb or size is zero. It can return NULL or non-NULL + * depending on the platform. + * https://www.securecoding.cert.org/confluence/display/c/MEM04-C.Beware+of+zero-lengthallocations + * + * Here are some extracts from realloc man pages on different platforms. + * + * void realloc( void memblock, size_t size ); + * + * Windows: + * + * If there is not enough available memory to expand the block + * to the given size, the original block is left unchanged, + * and NULL is returned. 
If size is zero, then the block + * pointed to by memblock is freed; the return value is NULL, + * and memblock is left pointing at a freed block. + * + * OpenBSD: + * + * If size or nmemb is equal to 0, a unique pointer to an + * access protected, zero sized object is returned. Access via + * this pointer will generate a SIGSEGV exception. + * + * Linux: + * + * If size was equal to 0, either NULL or a pointer suitable + * to be passed to free() is returned. + * + * OS X: + * + * If size is zero and ptr is not NULL, a new, minimum sized + * object is allocated and the original object is freed. + * + * It looks like images with zero width or height can trigger + * this, and fuzzing behaviour will differ by platform, so + * fuzzing on one platform may not detect zero-size allocation + * problems on other platforms. + */ + if (size == 0 || nmemb == 0) { + return NULL; + } + return realloc(optr, size * nmemb); +} diff --git a/torchvision/csrc/io/image/cpu/read_write_file.cpp b/torchvision/csrc/io/image/cpu/read_write_file.cpp index def74c6721a..06de72a5053 100644 --- a/torchvision/csrc/io/image/cpu/read_write_file.cpp +++ b/torchvision/csrc/io/image/cpu/read_write_file.cpp @@ -17,7 +17,7 @@ std::wstring utf8_decode(const std::string& str) { return std::wstring(); } int size_needed = MultiByteToWideChar( - CP_UTF8, 0, str.c_str(), static_cast(str.size()), NULL, 0); + CP_UTF8, 0, str.c_str(), static_cast(str.size()), nullptr, 0); TORCH_CHECK(size_needed > 0, "Error converting the content to Unicode"); std::wstring wstrTo(size_needed, 0); MultiByteToWideChar( diff --git a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp b/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp deleted file mode 100644 index ee7d432f30d..00000000000 --- a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.cpp +++ /dev/null @@ -1,208 +0,0 @@ -#include "decode_jpeg_cuda.h" - -#include - -#if NVJPEG_FOUND -#include -#include -#include -#endif - -#include - -namespace vision { -namespace image { - -#if !NVJPEG_FOUND - -torch::Tensor decode_jpeg_cuda( - const torch::Tensor& data, - ImageReadMode mode, - torch::Device device) { - TORCH_CHECK( - false, "decode_jpeg_cuda: torchvision not compiled with nvJPEG support"); -} - -#else - -namespace { -static nvjpegHandle_t nvjpeg_handle = nullptr; -} - -torch::Tensor decode_jpeg_cuda( - const torch::Tensor& data, - ImageReadMode mode, - torch::Device device) { - C10_LOG_API_USAGE_ONCE( - "torchvision.csrc.io.image.cuda.decode_jpeg_cuda.decode_jpeg_cuda"); - TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); - - TORCH_CHECK( - !data.is_cuda(), - "The input tensor must be on CPU when decoding with nvjpeg") - - TORCH_CHECK( - data.dim() == 1 && data.numel() > 0, - "Expected a non empty 1-dimensional tensor"); - - TORCH_CHECK(device.is_cuda(), "Expected a cuda device") - - int major_version; - int minor_version; - nvjpegStatus_t get_major_property_status = - nvjpegGetProperty(MAJOR_VERSION, &major_version); - nvjpegStatus_t get_minor_property_status = - nvjpegGetProperty(MINOR_VERSION, &minor_version); - - TORCH_CHECK( - get_major_property_status == NVJPEG_STATUS_SUCCESS, - "nvjpegGetProperty failed: ", - get_major_property_status); - TORCH_CHECK( - get_minor_property_status == NVJPEG_STATUS_SUCCESS, - "nvjpegGetProperty failed: ", - get_minor_property_status); - if ((major_version < 11) || ((major_version == 11) && (minor_version < 6))) { - TORCH_WARN_ONCE( - "There is a memory leak issue in the nvjpeg library for CUDA versions < 11.6. 
" - "Make sure to rely on CUDA 11.6 or above before using decode_jpeg(..., device='cuda')."); - } - - at::cuda::CUDAGuard device_guard(device); - - // Create global nvJPEG handle - static std::once_flag nvjpeg_handle_creation_flag; - std::call_once(nvjpeg_handle_creation_flag, []() { - if (nvjpeg_handle == nullptr) { - nvjpegStatus_t create_status = nvjpegCreateSimple(&nvjpeg_handle); - - if (create_status != NVJPEG_STATUS_SUCCESS) { - // Reset handle so that one can still call the function again in the - // same process if there was a failure - free(nvjpeg_handle); - nvjpeg_handle = nullptr; - } - TORCH_CHECK( - create_status == NVJPEG_STATUS_SUCCESS, - "nvjpegCreateSimple failed: ", - create_status); - } - }); - - // Create the jpeg state - nvjpegJpegState_t jpeg_state; - nvjpegStatus_t state_status = - nvjpegJpegStateCreate(nvjpeg_handle, &jpeg_state); - - TORCH_CHECK( - state_status == NVJPEG_STATUS_SUCCESS, - "nvjpegJpegStateCreate failed: ", - state_status); - - auto datap = data.data_ptr(); - - // Get the image information - int num_channels; - nvjpegChromaSubsampling_t subsampling; - int widths[NVJPEG_MAX_COMPONENT]; - int heights[NVJPEG_MAX_COMPONENT]; - nvjpegStatus_t info_status = nvjpegGetImageInfo( - nvjpeg_handle, - datap, - data.numel(), - &num_channels, - &subsampling, - widths, - heights); - - if (info_status != NVJPEG_STATUS_SUCCESS) { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK(false, "nvjpegGetImageInfo failed: ", info_status); - } - - if (subsampling == NVJPEG_CSS_UNKNOWN) { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK(false, "Unknown NVJPEG chroma subsampling"); - } - - int width = widths[0]; - int height = heights[0]; - - nvjpegOutputFormat_t ouput_format; - int num_channels_output; - - switch (mode) { - case IMAGE_READ_MODE_UNCHANGED: - num_channels_output = num_channels; - // For some reason, setting output_format to NVJPEG_OUTPUT_UNCHANGED will - // not properly decode RGB images (it's fine for grayscale), so we set - // output_format manually here - if (num_channels == 1) { - ouput_format = NVJPEG_OUTPUT_Y; - } else if (num_channels == 3) { - ouput_format = NVJPEG_OUTPUT_RGB; - } else { - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK( - false, - "When mode is UNCHANGED, only 1 or 3 input channels are allowed."); - } - break; - case IMAGE_READ_MODE_GRAY: - ouput_format = NVJPEG_OUTPUT_Y; - num_channels_output = 1; - break; - case IMAGE_READ_MODE_RGB: - ouput_format = NVJPEG_OUTPUT_RGB; - num_channels_output = 3; - break; - default: - nvjpegJpegStateDestroy(jpeg_state); - TORCH_CHECK( - false, "The provided mode is not supported for JPEG decoding on GPU"); - } - - auto out_tensor = torch::empty( - {int64_t(num_channels_output), int64_t(height), int64_t(width)}, - torch::dtype(torch::kU8).device(device)); - - // nvjpegImage_t is a struct with - // - an array of pointers to each channel - // - the pitch for each channel - // which must be filled in manually - nvjpegImage_t out_image; - - for (int c = 0; c < num_channels_output; c++) { - out_image.channel[c] = out_tensor[c].data_ptr(); - out_image.pitch[c] = width; - } - for (int c = num_channels_output; c < NVJPEG_MAX_COMPONENT; c++) { - out_image.channel[c] = nullptr; - out_image.pitch[c] = 0; - } - - cudaStream_t stream = at::cuda::getCurrentCUDAStream(device.index()); - - nvjpegStatus_t decode_status = nvjpegDecode( - nvjpeg_handle, - jpeg_state, - datap, - data.numel(), - ouput_format, - &out_image, - stream); - - nvjpegJpegStateDestroy(jpeg_state); - - TORCH_CHECK( - decode_status == 
NVJPEG_STATUS_SUCCESS, - "nvjpegDecode failed: ", - decode_status); - - return out_tensor; -} - -#endif // NVJPEG_FOUND - -} // namespace image -} // namespace vision diff --git a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h b/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h deleted file mode 100644 index 496b355e9b7..00000000000 --- a/torchvision/csrc/io/image/cuda/decode_jpeg_cuda.h +++ /dev/null @@ -1,15 +0,0 @@ -#pragma once - -#include -#include "../image_read_mode.h" - -namespace vision { -namespace image { - -C10_EXPORT torch::Tensor decode_jpeg_cuda( - const torch::Tensor& data, - ImageReadMode mode, - torch::Device device); - -} // namespace image -} // namespace vision diff --git a/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp new file mode 100644 index 00000000000..85aa6c760c1 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.cpp @@ -0,0 +1,602 @@ +#include "decode_jpegs_cuda.h" +#if !NVJPEG_FOUND +namespace vision { +namespace image { +std::vector decode_jpegs_cuda( + const std::vector& encoded_images, + vision::image::ImageReadMode mode, + torch::Device device) { + TORCH_CHECK( + false, "decode_jpegs_cuda: torchvision not compiled with nvJPEG support"); +} +} // namespace image +} // namespace vision + +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace vision { +namespace image { + +std::mutex decoderMutex; +std::unique_ptr cudaJpegDecoder; + +std::vector decode_jpegs_cuda( + const std::vector& encoded_images, + vision::image::ImageReadMode mode, + torch::Device device) { + C10_LOG_API_USAGE_ONCE( + "torchvision.csrc.io.image.cuda.decode_jpegs_cuda.decode_jpegs_cuda"); + + std::lock_guard lock(decoderMutex); + std::vector contig_images; + contig_images.reserve(encoded_images.size()); + + TORCH_CHECK( + device.is_cuda(), "Expected the device parameter to be a cuda device"); + + for (auto& encoded_image : encoded_images) { + TORCH_CHECK( + encoded_image.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); + + TORCH_CHECK( + !encoded_image.is_cuda(), + "The input tensor must be on CPU when decoding with nvjpeg") + + TORCH_CHECK( + encoded_image.dim() == 1 && encoded_image.numel() > 0, + "Expected a non empty 1-dimensional tensor"); + + // nvjpeg requires images to be contiguous + if (encoded_image.is_contiguous()) { + contig_images.push_back(encoded_image); + } else { + contig_images.push_back(encoded_image.contiguous()); + } + } + + int major_version; + int minor_version; + nvjpegStatus_t get_major_property_status = + nvjpegGetProperty(MAJOR_VERSION, &major_version); + nvjpegStatus_t get_minor_property_status = + nvjpegGetProperty(MINOR_VERSION, &minor_version); + + TORCH_CHECK( + get_major_property_status == NVJPEG_STATUS_SUCCESS, + "nvjpegGetProperty failed: ", + get_major_property_status); + TORCH_CHECK( + get_minor_property_status == NVJPEG_STATUS_SUCCESS, + "nvjpegGetProperty failed: ", + get_minor_property_status); + if ((major_version < 11) || ((major_version == 11) && (minor_version < 6))) { + TORCH_WARN_ONCE( + "There is a memory leak issue in the nvjpeg library for CUDA versions < 11.6. 
" + "Make sure to rely on CUDA 11.6 or above before using decode_jpeg(..., device='cuda')."); + } + + at::cuda::CUDAGuard device_guard(device); + + if (cudaJpegDecoder == nullptr || device != cudaJpegDecoder->target_device) { + if (cudaJpegDecoder != nullptr) { + cudaJpegDecoder.reset(new CUDAJpegDecoder(device)); + } else { + cudaJpegDecoder = std::make_unique(device); + std::atexit([]() { cudaJpegDecoder.reset(); }); + } + } + + nvjpegOutputFormat_t output_format; + + switch (mode) { + case vision::image::IMAGE_READ_MODE_UNCHANGED: + // Using NVJPEG_OUTPUT_UNCHANGED causes differently sized output channels + // which is related to the subsampling used I'm not sure why this is the + // case, but for now we're just using RGB and later removing channels from + // grayscale images. + output_format = NVJPEG_OUTPUT_UNCHANGED; + break; + case vision::image::IMAGE_READ_MODE_GRAY: + output_format = NVJPEG_OUTPUT_Y; + break; + case vision::image::IMAGE_READ_MODE_RGB: + output_format = NVJPEG_OUTPUT_RGB; + break; + default: + TORCH_CHECK( + false, "The provided mode is not supported for JPEG decoding on GPU"); + } + + try { + at::cuda::CUDAEvent event; + auto result = cudaJpegDecoder->decode_images(contig_images, output_format); + auto current_stream{ + device.has_index() ? at::cuda::getCurrentCUDAStream( + cudaJpegDecoder->original_device.index()) + : at::cuda::getCurrentCUDAStream()}; + event.record(cudaJpegDecoder->stream); + event.block(current_stream); + return result; + } catch (const std::exception& e) { + if (typeid(e) != typeid(std::runtime_error)) { + TORCH_CHECK(false, "Error while decoding JPEG images: ", e.what()); + } else { + throw; + } + } +} + +CUDAJpegDecoder::CUDAJpegDecoder(const torch::Device& target_device) + : original_device{torch::kCUDA, c10::cuda::current_device()}, + target_device{target_device}, + stream{ + target_device.has_index() + ? 
at::cuda::getStreamFromPool(false, target_device.index()) + : at::cuda::getStreamFromPool(false)} { + nvjpegStatus_t status; + + hw_decode_available = true; + status = nvjpegCreateEx( + NVJPEG_BACKEND_HARDWARE, + NULL, + NULL, + NVJPEG_FLAGS_DEFAULT, + &nvjpeg_handle); + if (status == NVJPEG_STATUS_ARCH_MISMATCH) { + status = nvjpegCreateEx( + NVJPEG_BACKEND_DEFAULT, + NULL, + NULL, + NVJPEG_FLAGS_DEFAULT, + &nvjpeg_handle); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize nvjpeg with default backend: ", + status); + hw_decode_available = false; + } else { + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize nvjpeg with hardware backend: ", + status); + } + + status = nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg state: ", + status); + + status = nvjpegDecoderCreate( + nvjpeg_handle, NVJPEG_BACKEND_DEFAULT, &nvjpeg_decoder); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg decoder: ", + status); + + status = nvjpegDecoderStateCreate( + nvjpeg_handle, nvjpeg_decoder, &nvjpeg_decoupled_state); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg decoder state: ", + status); + + status = nvjpegBufferPinnedCreate(nvjpeg_handle, NULL, &pinned_buffers[0]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create pinned buffer: ", + status); + + status = nvjpegBufferPinnedCreate(nvjpeg_handle, NULL, &pinned_buffers[1]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create pinned buffer: ", + status); + + status = nvjpegBufferDeviceCreate(nvjpeg_handle, NULL, &device_buffer); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create device buffer: ", + status); + + status = nvjpegJpegStreamCreate(nvjpeg_handle, &jpeg_streams[0]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create jpeg stream: ", + status); + + status = nvjpegJpegStreamCreate(nvjpeg_handle, &jpeg_streams[1]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create jpeg stream: ", + status); + + status = nvjpegDecodeParamsCreate(nvjpeg_handle, &nvjpeg_decode_params); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create decode params: ", + status); +} + +CUDAJpegDecoder::~CUDAJpegDecoder() { + /* + The below code works on Mac and Linux, but fails on Windows. + This is because on Windows, the atexit hook which calls this + destructor executes after cuda is already shut down causing SIGSEGV. + We do not have a solution to this problem at the moment, so we'll + just leak the libnvjpeg & cuda variables for the time being and hope + that the CUDA runtime handles cleanup for us. + Please send a PR if you have a solution for this problem. 
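One possible mitigation for the late-atexit problem is to probe whether the CUDA runtime is still alive and only then attempt the nvjpeg teardown; the commented-out encoder destructor later in this diff uses the same cudaGetDeviceCount probe. A minimal sketch, assuming only the CUDA runtime API; the helper name cuda_runtime_is_alive is illustrative:

#include <cuda_runtime_api.h>

// Returns false once the CUDA runtime has shut down (e.g. the probe fails
// with cudaErrorCudartUnloading), in which case calling nvjpeg*Destroy or
// other teardown functions would be unsafe and leaking is the lesser evil.
static bool cuda_runtime_is_alive() {
  int device_count = 0;
  return cudaGetDeviceCount(&device_count) == cudaSuccess;
}

// The destructor body could then begin with:
//   if (!cuda_runtime_is_alive()) {
//     return;  // runtime already gone; deliberately leak, as described above
//   }
// followed by the nvjpeg*Destroy calls listed (commented out) just below.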
+ */ + + // nvjpegStatus_t status; + + // status = nvjpegDecodeParamsDestroy(nvjpeg_decode_params); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg decode params: ", + // status); + + // status = nvjpegJpegStreamDestroy(jpeg_streams[0]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy jpeg stream: ", + // status); + + // status = nvjpegJpegStreamDestroy(jpeg_streams[1]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy jpeg stream: ", + // status); + + // status = nvjpegBufferPinnedDestroy(pinned_buffers[0]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy pinned buffer[0]: ", + // status); + + // status = nvjpegBufferPinnedDestroy(pinned_buffers[1]); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy pinned buffer[1]: ", + // status); + + // status = nvjpegBufferDeviceDestroy(device_buffer); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy device buffer: ", + // status); + + // status = nvjpegJpegStateDestroy(nvjpeg_decoupled_state); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg decoupled state: ", + // status); + + // status = nvjpegDecoderDestroy(nvjpeg_decoder); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg decoder: ", + // status); + + // status = nvjpegJpegStateDestroy(nvjpeg_state); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg state: ", + // status); + + // status = nvjpegDestroy(nvjpeg_handle); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, "nvjpegDestroy failed: ", status); +} + +std::tuple< + std::vector, + std::vector, + std::vector> +CUDAJpegDecoder::prepare_buffers( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format) { + /* + This function scans the encoded images' jpeg headers and + allocates decoding buffers based on the metadata found + + Args: + - encoded_images (std::vector): a vector of tensors + containing the jpeg bitstreams to be decoded. Each tensor must have dtype + torch.uint8 and device cpu + - output_format (nvjpegOutputFormat_t): NVJPEG_OUTPUT_RGB, NVJPEG_OUTPUT_Y + or NVJPEG_OUTPUT_UNCHANGED + + Returns: + - decoded_images (std::vector): a vector of nvjpegImages + containing pointers to the memory of the decoded images + - output_tensors (std::vector): a vector of Tensors + containing the decoded images. 
`decoded_images` points to the memory of + output_tensors + - channels (std::vector): a vector of ints containing the number of + output image channels for every image + */ + + int width[NVJPEG_MAX_COMPONENT]; + int height[NVJPEG_MAX_COMPONENT]; + std::vector channels(encoded_images.size()); + nvjpegChromaSubsampling_t subsampling; + nvjpegStatus_t status; + + std::vector output_tensors{encoded_images.size()}; + std::vector decoded_images{encoded_images.size()}; + + for (std::vector::size_type i = 0; i < encoded_images.size(); + i++) { + // extract bitstream meta data to figure out the number of channels, height, + // width for every image + status = nvjpegGetImageInfo( + nvjpeg_handle, + (unsigned char*)encoded_images[i].data_ptr(), + encoded_images[i].numel(), + &channels[i], + &subsampling, + width, + height); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, "Failed to get image info: ", status); + + TORCH_CHECK( + subsampling != NVJPEG_CSS_UNKNOWN, "Unknown chroma subsampling"); + + // output channels may be different from the actual number of channels in + // the image, e.g. we decode a grayscale image as RGB and slice off the + // extra channels later + int output_channels = 3; + if (output_format == NVJPEG_OUTPUT_RGB || + output_format == NVJPEG_OUTPUT_UNCHANGED) { + output_channels = 3; + } else if (output_format == NVJPEG_OUTPUT_Y) { + output_channels = 1; + } + + // reserve output buffer + auto output_tensor = torch::empty( + {int64_t(output_channels), int64_t(height[0]), int64_t(width[0])}, + torch::dtype(torch::kU8).device(target_device)); + output_tensors[i] = output_tensor; + + // fill nvjpegImage_t struct + for (int c = 0; c < output_channels; c++) { + decoded_images[i].channel[c] = output_tensor[c].data_ptr(); + decoded_images[i].pitch[c] = width[0]; + } + for (int c = output_channels; c < NVJPEG_MAX_COMPONENT; c++) { + decoded_images[i].channel[c] = NULL; + decoded_images[i].pitch[c] = 0; + } + } + return {decoded_images, output_tensors, channels}; +} + +std::vector CUDAJpegDecoder::decode_images( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format) { + /* + This function decodes a batch of jpeg bitstreams. + We scan all encoded bitstreams and sort them into two groups: + 1. Baseline JPEGs: Can be decoded with hardware support on A100+ GPUs. + 2. Other JPEGs (e.g. progressive JPEGs): Can also be decoded on the + GPU (albeit with software support only) but need some preprocessing on the + host first. + + See + https://github.com/NVIDIA/CUDALibrarySamples/blob/f17940ac4e705bf47a8c39f5365925c1665f6c98/nvJPEG/nvJPEG-Decoder/nvjpegDecoder.cpp#L33 + for reference. + + Args: + - encoded_images (std::vector): a vector of tensors + containing the jpeg bitstreams to be decoded + - output_format (nvjpegOutputFormat_t): NVJPEG_OUTPUT_RGB, NVJPEG_OUTPUT_Y + or NVJPEG_OUTPUT_UNCHANGED + - device (torch::Device): The desired CUDA device for the returned Tensors + + Returns: + - output_tensors (std::vector): a vector of Tensors + containing the decoded images + */ + + auto [decoded_imgs_buf, output_tensors, channels] = + prepare_buffers(encoded_images, output_format); + + nvjpegStatus_t status; + cudaError_t cudaStatus; + + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK( + cudaStatus == cudaSuccess, + "Failed to synchronize CUDA stream: ", + cudaStatus); + + // baseline JPEGs can be batch decoded with hardware support on A100+ GPUs + // ultra fast! 
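For orientation, a caller-side sketch of how this decoder is reached through the public decode_jpegs_cuda() entry point. The wrapper function name, the file names and the device index are placeholders; read_file() is torchvision's existing helper that loads a file into a 1-D uint8 CPU tensor, and the exact include path depends on the build setup:

#include "image.h"  // torchvision/csrc/io/image/image.h (path is an assumption)

std::vector<torch::Tensor> decode_two_jpegs_on_gpu() {
  std::vector<torch::Tensor> encoded = {
      vision::image::read_file("a.jpg"),
      vision::image::read_file("b.jpg"),
  };
  // Baseline JPEGs in the batch take the batched hardware path below on A100+
  // GPUs; anything else (e.g. progressive JPEGs) falls back to the per-image
  // software path on the same CUDA stream.
  return vision::image::decode_jpegs_cuda(
      encoded,
      vision::image::IMAGE_READ_MODE_RGB,
      torch::Device(torch::kCUDA, 0));  // each output: 3xHxW uint8 on cuda:0
}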
+ std::vector hw_input_buffer; + std::vector hw_input_buffer_size; + std::vector hw_output_buffer; + + // other JPEG types such as progressive JPEGs can be decoded one-by-one in + // software slow :( + std::vector sw_input_buffer; + std::vector sw_input_buffer_size; + std::vector sw_output_buffer; + + if (hw_decode_available) { + for (std::vector::size_type i = 0; i < encoded_images.size(); + ++i) { + // extract bitstream meta data to figure out whether a bit-stream can be + // decoded + nvjpegJpegStreamParseHeader( + nvjpeg_handle, + encoded_images[i].data_ptr(), + encoded_images[i].numel(), + jpeg_streams[0]); + int isSupported = -1; + nvjpegDecodeBatchedSupported( + nvjpeg_handle, jpeg_streams[0], &isSupported); + + if (isSupported == 0) { + hw_input_buffer.push_back(encoded_images[i].data_ptr()); + hw_input_buffer_size.push_back(encoded_images[i].numel()); + hw_output_buffer.push_back(decoded_imgs_buf[i]); + } else { + sw_input_buffer.push_back(encoded_images[i].data_ptr()); + sw_input_buffer_size.push_back(encoded_images[i].numel()); + sw_output_buffer.push_back(decoded_imgs_buf[i]); + } + } + } else { + for (std::vector::size_type i = 0; i < encoded_images.size(); + ++i) { + sw_input_buffer.push_back(encoded_images[i].data_ptr()); + sw_input_buffer_size.push_back(encoded_images[i].numel()); + sw_output_buffer.push_back(decoded_imgs_buf[i]); + } + } + + if (hw_input_buffer.size() > 0) { + // UNCHANGED behaves weird, so we use RGB instead + status = nvjpegDecodeBatchedInitialize( + nvjpeg_handle, + nvjpeg_state, + hw_input_buffer.size(), + 1, + output_format == NVJPEG_OUTPUT_UNCHANGED ? NVJPEG_OUTPUT_RGB + : output_format); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to initialize batch decoding: ", + status); + + status = nvjpegDecodeBatched( + nvjpeg_handle, + nvjpeg_state, + hw_input_buffer.data(), + hw_input_buffer_size.data(), + hw_output_buffer.data(), + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, "Failed to decode batch: ", status); + } + + if (sw_input_buffer.size() > 0) { + status = + nvjpegStateAttachDeviceBuffer(nvjpeg_decoupled_state, device_buffer); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to attach device buffer: ", + status); + int buffer_index = 0; + // UNCHANGED behaves weird, so we use RGB instead + status = nvjpegDecodeParamsSetOutputFormat( + nvjpeg_decode_params, + output_format == NVJPEG_OUTPUT_UNCHANGED ? 
NVJPEG_OUTPUT_RGB + : output_format); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to set output format: ", + status); + for (std::vector::size_type i = 0; i < sw_input_buffer.size(); + ++i) { + status = nvjpegJpegStreamParse( + nvjpeg_handle, + sw_input_buffer[i], + sw_input_buffer_size[i], + 0, + 0, + jpeg_streams[buffer_index]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to parse jpeg stream: ", + status); + + status = nvjpegStateAttachPinnedBuffer( + nvjpeg_decoupled_state, pinned_buffers[buffer_index]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to attach pinned buffer: ", + status); + + status = nvjpegDecodeJpegHost( + nvjpeg_handle, + nvjpeg_decoder, + nvjpeg_decoupled_state, + nvjpeg_decode_params, + jpeg_streams[buffer_index]); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to decode jpeg stream: ", + status); + + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK( + cudaStatus == cudaSuccess, + "Failed to synchronize CUDA stream: ", + cudaStatus); + + status = nvjpegDecodeJpegTransferToDevice( + nvjpeg_handle, + nvjpeg_decoder, + nvjpeg_decoupled_state, + jpeg_streams[buffer_index], + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to transfer jpeg to device: ", + status); + + buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode + // to avoid an extra sync + + status = nvjpegDecodeJpegDevice( + nvjpeg_handle, + nvjpeg_decoder, + nvjpeg_decoupled_state, + &sw_output_buffer[i], + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to decode jpeg stream: ", + status); + } + } + + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK( + cudaStatus == cudaSuccess, + "Failed to synchronize CUDA stream: ", + cudaStatus); + + // prune extraneous channels from single channel images + if (output_format == NVJPEG_OUTPUT_UNCHANGED) { + for (std::vector::size_type i = 0; i < output_tensors.size(); + ++i) { + if (channels[i] == 1) { + output_tensors[i] = output_tensors[i][0].unsqueeze(0).clone(); + } + } + } + + return output_tensors; +} + +} // namespace image +} // namespace vision + +#endif diff --git a/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h new file mode 100644 index 00000000000..6f72d9e35b2 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/decode_jpegs_cuda.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include +#include "../common.h" + +#if NVJPEG_FOUND +#include +#include + +namespace vision { +namespace image { +class CUDAJpegDecoder { + public: + CUDAJpegDecoder(const torch::Device& target_device); + ~CUDAJpegDecoder(); + + std::vector decode_images( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format); + + const torch::Device original_device; + const torch::Device target_device; + const c10::cuda::CUDAStream stream; + + private: + std::tuple< + std::vector, + std::vector, + std::vector> + prepare_buffers( + const std::vector& encoded_images, + const nvjpegOutputFormat_t& output_format); + nvjpegJpegState_t nvjpeg_state; + nvjpegJpegState_t nvjpeg_decoupled_state; + nvjpegBufferPinned_t pinned_buffers[2]; + nvjpegBufferDevice_t device_buffer; + nvjpegJpegStream_t jpeg_streams[2]; + nvjpegDecodeParams_t nvjpeg_decode_params; + nvjpegJpegDecoder_t nvjpeg_decoder; + bool hw_decode_available{false}; + nvjpegHandle_t nvjpeg_handle; +}; +} // namespace image +} // namespace vision +#endif diff --git a/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h 
b/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h new file mode 100644 index 00000000000..8c3ad8f9a9d --- /dev/null +++ b/torchvision/csrc/io/image/cuda/encode_decode_jpegs_cuda.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include "../common.h" +#include "decode_jpegs_cuda.h" +#include "encode_jpegs_cuda.h" + +namespace vision { +namespace image { + +/* + +Fast jpeg decoding with CUDA. +A100+ GPUs have dedicated hardware support for jpeg decoding. + +Args: + - encoded_images (const std::vector&): a vector of tensors + containing the jpeg bitstreams to be decoded. Each tensor must have dtype + torch.uint8 and device cpu + - mode (ImageReadMode): IMAGE_READ_MODE_UNCHANGED, IMAGE_READ_MODE_GRAY and +IMAGE_READ_MODE_RGB are supported + - device (torch::Device): The desired CUDA device to run the decoding on and +which will contain the output tensors + +Returns: + - decoded_images (std::vector): a vector of torch::Tensors of +dtype torch.uint8 on the specified containing the decoded images + +Notes: + - If a single image fails, the whole batch fails. + - This function is thread-safe +*/ +C10_EXPORT std::vector decode_jpegs_cuda( + const std::vector& encoded_images, + vision::image::ImageReadMode mode, + torch::Device device); + +/* +Fast jpeg encoding with CUDA. + +Args: + - decoded_images (const std::vector&): a vector of contiguous +CUDA tensors of dtype torch.uint8 to be encoded. + - quality (int64_t): 0-100, 75 is the default + +Returns: + - encoded_images (std::vector): a vector of CUDA +torch::Tensors of dtype torch.uint8 containing the encoded images + +Notes: + - If a single image fails, the whole batch fails. + - This function is thread-safe +*/ +C10_EXPORT std::vector encode_jpegs_cuda( + const std::vector& decoded_images, + const int64_t quality); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp new file mode 100644 index 00000000000..80accc1a241 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.cpp @@ -0,0 +1,278 @@ +#include "encode_jpegs_cuda.h" +#if !NVJPEG_FOUND +namespace vision { +namespace image { +std::vector encode_jpegs_cuda( + const std::vector& decoded_images, + const int64_t quality) { + TORCH_CHECK( + false, "encode_jpegs_cuda: torchvision not compiled with nvJPEG support"); +} +} // namespace image +} // namespace vision +#else + +#include +#include +#include +#include +#include +#include "c10/core/ScalarType.h" + +namespace vision { +namespace image { + +// We use global variables to cache the encoder and decoder instances and +// reuse them across calls to the corresponding pytorch functions +std::mutex encoderMutex; +std::unique_ptr cudaJpegEncoder; + +std::vector encode_jpegs_cuda( + const std::vector& decoded_images, + const int64_t quality) { + C10_LOG_API_USAGE_ONCE( + "torchvision.csrc.io.image.cuda.encode_jpegs_cuda.encode_jpegs_cuda"); + + // Some nvjpeg structures are not thread safe so we're keeping it single + // threaded for now. In the future this may be an opportunity to unlock + // further speedups + std::lock_guard lock(encoderMutex); + TORCH_CHECK(decoded_images.size() > 0, "Empty input tensor list"); + torch::Device device = decoded_images[0].device(); + at::cuda::CUDAGuard device_guard(device); + + // lazy init of the encoder class + // the encoder object holds on to a lot of state and is expensive to create, + // so we reuse it across calls. 
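A matching caller-side sketch for the encoding entry point documented in the header above; the image size, quality value and wrapper name are illustrative. Inputs must be 3xHxW uint8 tensors already on the target CUDA device, and each returned tensor holds a JPEG bitstream on that same device:

torch::Tensor encode_one_image_on_gpu() {
  // A synthetic RGB image resident on cuda:0.
  torch::Tensor image = torch::randint(
      0, 256, {3, 480, 640}, torch::dtype(torch::kU8).device(torch::kCUDA, 0));
  std::vector<torch::Tensor> encoded =
      vision::image::encode_jpegs_cuda({image}, /*quality=*/90);
  // encoded[0] is a 1-D uint8 CUDA tensor with the JPEG bitstream; move it to
  // CPU (e.g. encoded[0].cpu()) before handing it to vision::image::write_file().
  return encoded[0];
}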
NB: the cached structures are device specific + // and cannot be reused across devices + if (cudaJpegEncoder == nullptr || device != cudaJpegEncoder->target_device) { + if (cudaJpegEncoder != nullptr) { + delete cudaJpegEncoder.release(); + } + + cudaJpegEncoder = std::make_unique(device); + + // Unfortunately, we cannot rely on the smart pointer releasing the encoder + // object correctly upon program exit. This is because, when cudaJpegEncoder + // gets destroyed, the CUDA runtime may already be shut down, rendering all + // destroy* calls in the encoder destructor invalid. Instead, we use an + // atexit hook which executes after main() finishes, but hopefully before + // CUDA shuts down when the program exits. If CUDA is already shut down the + // destructor will detect this and will not attempt to destroy any encoder + // structures. + std::atexit([]() { delete cudaJpegEncoder.release(); }); + } + + std::vector contig_images; + contig_images.reserve(decoded_images.size()); + for (const auto& image : decoded_images) { + TORCH_CHECK( + image.dtype() == torch::kU8, "Input tensor dtype should be uint8"); + + TORCH_CHECK( + image.device() == device, + "All input tensors must be on the same CUDA device when encoding with nvjpeg") + + TORCH_CHECK( + image.dim() == 3 && image.numel() > 0, + "Input data should be a 3-dimensional tensor"); + + TORCH_CHECK( + image.size(0) == 3, + "The number of channels should be 3, got: ", + image.size(0)); + + // nvjpeg requires images to be contiguous + if (image.is_contiguous()) { + contig_images.push_back(image); + } else { + contig_images.push_back(image.contiguous()); + } + } + + cudaJpegEncoder->set_quality(quality); + std::vector encoded_images; + for (const auto& image : contig_images) { + auto encoded_image = cudaJpegEncoder->encode_jpeg(image); + encoded_images.push_back(encoded_image); + } + at::cuda::CUDAEvent event; + event.record(cudaJpegEncoder->stream); + + // We use a dedicated stream to do the encoding and even though the results + // may be ready on that stream we cannot assume that they are also available + // on the current stream of the calling context when this function returns. We + // use a blocking event to ensure that this is indeed the case. Crucially, we + // do not want to block the host at this particular point + // (which is what cudaStreamSynchronize would do.) Events allow us to + // synchronize the streams without blocking the host. + event.block(cudaJpegEncoder->current_stream); + return encoded_images; +} + +CUDAJpegEncoder::CUDAJpegEncoder(const torch::Device& target_device) + : original_device{torch::kCUDA, torch::cuda::current_device()}, + target_device{target_device}, + stream{ + target_device.has_index() + ? at::cuda::getStreamFromPool(false, target_device.index()) + : at::cuda::getStreamFromPool(false)}, + current_stream{ + original_device.has_index() + ? 
at::cuda::getCurrentCUDAStream(original_device.index()) + : at::cuda::getCurrentCUDAStream()} { + nvjpegStatus_t status; + status = nvjpegCreateSimple(&nvjpeg_handle); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg handle: ", + status); + + status = nvjpegEncoderStateCreate(nvjpeg_handle, &nv_enc_state, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg encoder state: ", + status); + + status = nvjpegEncoderParamsCreate(nvjpeg_handle, &nv_enc_params, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to create nvjpeg encoder params: ", + status); +} + +CUDAJpegEncoder::~CUDAJpegEncoder() { + /* + The below code works on Mac and Linux, but fails on Windows. + This is because on Windows, the atexit hook which calls this + destructor executes after cuda is already shut down causing SIGSEGV. + We do not have a solution to this problem at the moment, so we'll + just leak the libnvjpeg & cuda variables for the time being and hope + that the CUDA runtime handles cleanup for us. + Please send a PR if you have a solution for this problem. + */ + + // // We run cudaGetDeviceCount as a dummy to test if the CUDA runtime is + // still + // // initialized. If it is not, we can skip the rest of this function as it + // is + // // unsafe to execute. + // int deviceCount = 0; + // cudaError_t error = cudaGetDeviceCount(&deviceCount); + // if (error != cudaSuccess) + // return; // CUDA runtime has already shut down. There's nothing we can do + // // now. + + // nvjpegStatus_t status; + + // status = nvjpegEncoderParamsDestroy(nv_enc_params); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg encoder params: ", + // status); + + // status = nvjpegEncoderStateDestroy(nv_enc_state); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, + // "Failed to destroy nvjpeg encoder state: ", + // status); + + // cudaStreamSynchronize(stream); + + // status = nvjpegDestroy(nvjpeg_handle); + // TORCH_CHECK( + // status == NVJPEG_STATUS_SUCCESS, "nvjpegDestroy failed: ", status); +} + +torch::Tensor CUDAJpegEncoder::encode_jpeg(const torch::Tensor& src_image) { + nvjpegStatus_t status; + cudaError_t cudaStatus; + + // Ensure that the incoming src_image is safe to use + cudaStatus = cudaStreamSynchronize(current_stream); + TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus); + + int channels = src_image.size(0); + int height = src_image.size(1); + int width = src_image.size(2); + + status = nvjpegEncoderParamsSetSamplingFactors( + nv_enc_params, NVJPEG_CSS_444, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to set nvjpeg encoder params sampling factors: ", + status); + + nvjpegImage_t target_image; + for (int c = 0; c < channels; c++) { + target_image.channel[c] = src_image[c].data_ptr(); + // this is why we need contiguous tensors + target_image.pitch[c] = width; + } + for (int c = channels; c < NVJPEG_MAX_COMPONENT; c++) { + target_image.channel[c] = nullptr; + target_image.pitch[c] = 0; + } + // Encode the image + status = nvjpegEncodeImage( + nvjpeg_handle, + nv_enc_state, + nv_enc_params, + &target_image, + NVJPEG_INPUT_RGB, + width, + height, + stream); + + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, "image encoding failed: ", status); + // Retrieve length of the encoded image + size_t length; + status = nvjpegEncodeRetrieveBitstreamDevice( + nvjpeg_handle, nv_enc_state, NULL, &length, stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to 
retrieve encoded image stream state: ", + status); + + // Synchronize the stream to ensure that the encoded image is ready + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus); + + // Reserve buffer for the encoded image + torch::Tensor encoded_image = torch::empty( + {static_cast(length)}, + torch::TensorOptions() + .dtype(torch::kByte) + .layout(torch::kStrided) + .device(target_device) + .requires_grad(false)); + cudaStatus = cudaStreamSynchronize(stream); + TORCH_CHECK(cudaStatus == cudaSuccess, "CUDA ERROR: ", cudaStatus); + // Retrieve the encoded image + status = nvjpegEncodeRetrieveBitstreamDevice( + nvjpeg_handle, + nv_enc_state, + encoded_image.data_ptr(), + &length, + stream); + TORCH_CHECK( + status == NVJPEG_STATUS_SUCCESS, + "Failed to retrieve encoded image: ", + status); + return encoded_image; +} + +void CUDAJpegEncoder::set_quality(const int64_t quality) { + nvjpegStatus_t paramsQualityStatus = + nvjpegEncoderParamsSetQuality(nv_enc_params, quality, stream); + TORCH_CHECK( + paramsQualityStatus == NVJPEG_STATUS_SUCCESS, + "Failed to set nvjpeg encoder params quality: ", + paramsQualityStatus); +} + +} // namespace image +} // namespace vision + +#endif // NVJPEG_FOUND diff --git a/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h new file mode 100644 index 00000000000..6ee0ad91df4 --- /dev/null +++ b/torchvision/csrc/io/image/cuda/encode_jpegs_cuda.h @@ -0,0 +1,34 @@ +#pragma once +#include +#include +#if NVJPEG_FOUND + +#include +#include +#include + +namespace vision { +namespace image { + +class CUDAJpegEncoder { + public: + CUDAJpegEncoder(const torch::Device& device); + ~CUDAJpegEncoder(); + + torch::Tensor encode_jpeg(const torch::Tensor& src_image); + + void set_quality(const int64_t quality); + + const torch::Device original_device; + const torch::Device target_device; + const c10::cuda::CUDAStream stream; + const c10::cuda::CUDAStream current_stream; + + protected: + nvjpegEncoderState_t nv_enc_state; + nvjpegEncoderParams_t nv_enc_params; + nvjpegHandle_t nvjpeg_handle; +}; +} // namespace image +} // namespace vision +#endif diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 3c9d632f030..b4a4ed54a67 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -1,33 +1,29 @@ #include "image.h" #include -#ifdef USE_PYTHON -#include -#endif - -// If we are in a Windows environment, we need to define -// initialization functions for the _custom_ops extension -#ifdef USE_PYTHON -#ifdef _WIN32 -PyMODINIT_FUNC PyInit_image(void) { - // No need to do anything. 
- return NULL; -} -#endif -#endif // USE_PYTHON namespace vision { namespace image { -static auto registry = torch::RegisterOperators() - .op("image::decode_png", &decode_png) - .op("image::encode_png", &encode_png) - .op("image::decode_jpeg", &decode_jpeg) - .op("image::encode_jpeg", &encode_jpeg) - .op("image::read_file", &read_file) - .op("image::write_file", &write_file) - .op("image::decode_image", &decode_image) - .op("image::decode_jpeg_cuda", &decode_jpeg_cuda); +static auto registry = + torch::RegisterOperators() + .op("image::decode_gif", &decode_gif) + .op("image::decode_png(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", + &decode_png) + .op("image::encode_png", &encode_png) + .op("image::decode_jpeg(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", + &decode_jpeg) + .op("image::decode_webp(Tensor encoded_data, int mode) -> Tensor", + &decode_webp) + .op("image::encode_jpeg", &encode_jpeg) + .op("image::read_file", &read_file) + .op("image::write_file", &write_file) + .op("image::decode_image(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor", + &decode_image) + .op("image::decode_jpegs_cuda", &decode_jpegs_cuda) + .op("image::encode_jpegs_cuda", &encode_jpegs_cuda) + .op("image::_jpeg_version", &_jpeg_version) + .op("image::_is_compiled_against_turbo", &_is_compiled_against_turbo); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/image.h b/torchvision/csrc/io/image/image.h index 05bac44c77d..3f47fdec65c 100644 --- a/torchvision/csrc/io/image/image.h +++ b/torchvision/csrc/io/image/image.h @@ -1,9 +1,11 @@ #pragma once +#include "cpu/decode_gif.h" #include "cpu/decode_image.h" #include "cpu/decode_jpeg.h" #include "cpu/decode_png.h" +#include "cpu/decode_webp.h" #include "cpu/encode_jpeg.h" #include "cpu/encode_png.h" #include "cpu/read_write_file.h" -#include "cuda/decode_jpeg_cuda.h" +#include "cuda/encode_decode_jpegs_cuda.h" diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp index 38b35014595..8f1fb3fb5b9 100644 --- a/torchvision/csrc/io/video/video.cpp +++ b/torchvision/csrc/io/video/video.cpp @@ -2,6 +2,8 @@ #include +using namespace ffmpeg; + namespace vision { namespace video { @@ -77,7 +79,7 @@ std::tuple _parseStream(const std::string& streamString) { long index_ = -1; if (match[2].matched) { try { - index_ = c10::stoi(match[2].str()); + index_ = std::stoi(match[2].str()); } catch (const std::exception&) { TORCH_CHECK( false, @@ -156,14 +158,34 @@ void Video::_getDecoderParams( } // _get decoder params -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); +void Video::initFromFile( + std::string videoPath, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + params.uri = videoPath; + _init(stream, numThreads); +} + +void Video::initFromMemory( + torch::Tensor videoTensor, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + callback = MemoryBuffer::getCallback( + videoTensor.data_ptr(), videoTensor.size(0)); + _init(stream, numThreads); +} + +void Video::_init(std::string stream, int64_t numThreads) { // set number of threads global numThreads_ = numThreads; // parse stream information current_stream = _parseStream(stream); // note that in the initial call we want to get 
all streams - Video::_getDecoderParams( + _getDecoderParams( 0, // video start 0, // headerOnly std::get<0>(current_stream), // stream info - remove that @@ -175,11 +197,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { std::string logMessage, logType; - // TODO: add read from memory option - params.uri = videoPath; - logType = "file"; - logMessage = videoPath; - // locals std::vector audioFPS, videoFPS; std::vector audioDuration, videoDuration, ccDuration, subsDuration; @@ -190,7 +207,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { c10::Dict> subsMetadata; // callback and metadata defined in struct - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); if (succeeded) { for (const auto& header : metadata) { double fps = double(header.fps); @@ -225,16 +243,23 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("cc", ccMetadata); - succeeded = Video::setCurrentStream(stream); - LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; + succeeded = setCurrentStream(stream); if (std::get<1>(current_stream) != -1) { LOG(INFO) << "Stream index set to " << std::get<1>(current_stream) << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; } +} + +Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { + C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); + if (!videoPath.empty()) { + initFromFile(videoPath, stream, numThreads); + } } // video bool Video::setCurrentStream(std::string stream = "video") { + TORCH_CHECK(initialized, "Video object has to be initialized first"); if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { current_stream = _parseStream(stream); } @@ -256,19 +281,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ); // callback and metadata defined in Video.h - return (decoder.init(params, std::move(callback), &metadata)); + DecoderInCallback tmp_callback = callback; + return (decoder.init(params, std::move(tmp_callback), &metadata)); } std::tuple Video::getCurrentStream() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return current_stream; } c10::Dict>> Video:: getStreamMetadata() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return streamsMetadata; } void Video::Seek(double ts, bool fastSeek = false) { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // initialize the class variables used for seeking and retrurn _getDecoderParams( ts, // video start @@ -282,20 +311,21 @@ void Video::Seek(double ts, bool fastSeek = false) { ); // callback and metadata defined in Video.h - succeeded = decoder.init(params, std::move(callback), &metadata); - LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); } std::tuple Video::Next() { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // if failing to decode simply return a null tensor (note, should we - // raise an exeption?) + // raise an exception?) 
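A sketch of how the new split initialization is intended to be driven (assumed usage; the stream spec "video", the thread count and the include path are placeholders). Constructing Video with an empty path defers initialization, and initFromMemory() then feeds it the raw bitstream tensor instead of a file path:

#include "video.h"  // torchvision/csrc/io/video/video.h (path is an assumption)

std::tuple<torch::Tensor, double> first_frame_from_memory(torch::Tensor bitstream) {
  // Empty path: the constructor skips initFromFile() and leaves the object
  // uninitialized until one of the init* methods is called, exactly once.
  auto video = c10::make_intrusive<vision::video::Video>(
      /*videoPath=*/"", /*stream=*/"video", /*numThreads=*/1);
  video->initFromMemory(bitstream, "video", /*numThreads=*/1);
  return video->Next();  // (decoded frame, presentation timestamp in seconds)
}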
double frame_pts_s; torch::Tensor outFrame = torch::zeros({0}, torch::kByte); // decode single frame DecoderOutputMessage out; int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successfull + // if successful if (res == 0) { frame_pts_s = double(double(out.header.pts) * 1e-6); @@ -345,6 +375,8 @@ std::tuple Video::Next() { static auto registerVideo = torch::class_